diff --git a/.asf.yaml b/.asf.yaml index 99fd6fac22c76..47a18d13cbca0 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -51,12 +51,20 @@ github: main: required_pull_request_reviews: required_approving_review_count: 1 + required_status_checks: + contexts: + - "Check License Header" + - "Use prettier to check formatting of documents" + - "Check Markdown Links" + - "Validate required_status_checks in .asf.yaml" + - "Spell Check with Typos" + - "Circular Dependency Check" + - "Detect Unused Dependencies" # needs to be updated as part of the release process # .asf.yaml doesn't support wildcard branch protection rules, only exact branch names # https://github.com/apache/infrastructure-asfyaml?tab=readme-ov-file#branch-protection - # Keeping set of protected branches for future releases - # Meanwhile creating a prerelease script that will update the branch protection names - # automatically. Keep track on it https://github.com/apache/datafusion/issues/17134 + # these branch protection blocks are autogenerated during the release process, as described in + # https://github.com/apache/datafusion/tree/main/dev/release#2-add-a-protection-to-release-candidate-branch branch-50: required_pull_request_reviews: required_approving_review_count: 1 @@ -66,66 +74,15 @@ github: branch-52: required_pull_request_reviews: required_approving_review_count: 1 - branch-53: - required_pull_request_reviews: - required_approving_review_count: 1 - branch-54: - required_pull_request_reviews: - required_approving_review_count: 1 - branch-55: - required_pull_request_reviews: - required_approving_review_count: 1 - branch-56: - required_pull_request_reviews: - required_approving_review_count: 1 - branch-57: - required_pull_request_reviews: - required_approving_review_count: 1 - branch-58: - required_pull_request_reviews: - required_approving_review_count: 1 - branch-59: - required_pull_request_reviews: - required_approving_review_count: 1 - branch-60: - required_pull_request_reviews: - required_approving_review_count: 1 - branch-61: - required_pull_request_reviews: - required_approving_review_count: 1 - branch-62: - required_pull_request_reviews: - required_approving_review_count: 1 - branch-63: - required_pull_request_reviews: - required_approving_review_count: 1 - branch-64: - required_pull_request_reviews: - required_approving_review_count: 1 - branch-65: - required_pull_request_reviews: - required_approving_review_count: 1 - branch-66: - required_pull_request_reviews: - required_approving_review_count: 1 - branch-67: - required_pull_request_reviews: - required_approving_review_count: 1 - branch-68: - required_pull_request_reviews: - required_approving_review_count: 1 - branch-69: - required_pull_request_reviews: - required_approving_review_count: 1 - branch-70: - required_pull_request_reviews: - required_approving_review_count: 1 pull_requests: # enable updating head branches of pull requests allow_update_branch: true allow_auto_merge: true + # auto-delete head branches after being merged + del_branch_on_merge: true # publishes the content of the `asf-site` branch to # https://datafusion.apache.org/ publish: whoami: asf-site + diff --git a/.github/actions/setup-builder/action.yaml b/.github/actions/setup-builder/action.yaml index 22d2f2187dd07..6228370c955a9 100644 --- a/.github/actions/setup-builder/action.yaml +++ b/.github/actions/setup-builder/action.yaml @@ -46,3 +46,17 @@ runs: # https://github.com/actions/checkout/issues/766 shell: bash run: git config --global --add safe.directory "$GITHUB_WORKSPACE" + - name: Remove
unnecessary preinstalled software + shell: bash + run: | + echo "Disk space before cleanup:" + df -h + apt-get clean + # remove tool cache: about 8.5GB (github has host /opt/hostedtoolcache mounted as /__t) + rm -rf /__t/* || true + # remove Haskell runtime: about 6.3GB (host /usr/local/.ghcup) + rm -rf /host/usr/local/.ghcup || true + # remove Android library: about 7.8GB (host /usr/local/lib/android) + rm -rf /host/usr/local/lib/android || true + echo "Disk space after cleanup:" + df -h \ No newline at end of file diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 9d1d77d44c378..2cd4bdfdd7923 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -23,6 +23,7 @@ updates: interval: weekly target-branch: main labels: [auto-dependencies] + open-pull-requests-limit: 15 ignore: # major version bumps of arrow* and parquet are handled manually - dependency-name: "arrow*" @@ -44,10 +45,27 @@ updates: patterns: - "prost*" - "pbjson*" + + # Catch-all: group only minor/patch into a single PR, + # excluding deps we always want separate (and excluding arrow/parquet which have their own group) + all-other-cargo-deps: + applies-to: version-updates + patterns: + - "*" + exclude-patterns: + - "arrow*" + - "parquet" + - "object_store" + - "sqlparser" + - "prost*" + - "pbjson*" + update-types: + - "minor" + - "patch" - package-ecosystem: "github-actions" directory: "/" schedule: - interval: "daily" + interval: "weekly" open-pull-requests-limit: 10 labels: [auto-dependencies] - package-ecosystem: "pip" diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml index f0a03d9841a9d..b7afdf3c1914d 100644 --- a/.github/workflows/audit.yml +++ b/.github/workflows/audit.yml @@ -33,17 +33,22 @@ on: paths: - "**/Cargo.toml" - "**/Cargo.lock" - + merge_group: +permissions: + contents: read + jobs: security_audit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Install cargo-audit - uses: taiki-e/install-action@f535147c22906d77695e11cb199e764aa610a4fc # v2.62.46 + uses: taiki-e/install-action@481c34c1cf3a84c68b5e46f4eccfc82af798415a # v2.75.23 with: tool: cargo-audit - name: Run audit check + # Note: you can ignore specific RUSTSEC issues using the `--ignore` flag, for example: + # run: cargo audit --ignore RUSTSEC-2026-0001 run: cargo audit diff --git a/.github/workflows/breaking_changes_detector.yml b/.github/workflows/breaking_changes_detector.yml new file mode 100644 index 0000000000000..4a4c909e7a781 --- /dev/null +++ b/.github/workflows/breaking_changes_detector.yml @@ -0,0 +1,142 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
+ +# Detect semver-incompatible (breaking) API changes in crates modified by a PR. +# +# Only public workspace crates that have file changes are checked. +# Internal crates (benchmarks, test-utils, sqllogictest, doc) are excluded. +# +# This workflow only runs cargo-semver-checks and uploads the result as an +# artifact. The actual PR comment is posted by a companion workflow +# (`breaking_changes_detector_comment.yml`) that picks up the artifact via +# `workflow_run`. +# +# Why split it? +# "The GITHUB_TOKEN has read-only permissions in pull requests from forked +# repositories." +# https://docs.github.com/en/actions/reference/events-that-trigger-workflows#pull_request +# A read-only token cannot post comments, so on fork PRs the previous +# single-workflow design failed with HTTP 403. We can't simply broaden the +# trigger here either: cargo-semver-checks compiles PR code (build.rs, proc +# macros), so granting this job a write token would expose it to any code +# in the PR. And ASF infra policy independently forbids `pull_request_target` +# for any workflow that exposes GITHUB_TOKEN +# (https://infra.apache.org/github-actions-policy.html). The companion +# `workflow_run` workflow runs in the base-repo context with write access +# and never executes PR code. + +name: "Detect breaking changes" + +on: + pull_request: + branches: + - main + +permissions: + contents: read + +jobs: + check-semver: + name: Check semver + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + fetch-depth: 0 + + # For fork PRs, `origin` points to the fork, not the upstream repo. + # Explicitly fetch the base branch from the upstream repo so we have + # a valid baseline ref for both diff and semver-checks. + - name: Fetch base branch + env: + BASE_REF: ${{ github.base_ref }} + REPO: ${{ github.repository }} + run: git fetch "https://github.com/${REPO}.git" "${BASE_REF}:refs/remotes/origin/${BASE_REF}" + + - name: Determine changed crates + id: changed_crates + env: + BASE_REF: ${{ github.base_ref }} + run: | + PACKAGES=$(ci/scripts/changed_crates.sh changed-crates "origin/${BASE_REF}") + echo "packages=$PACKAGES" >> "$GITHUB_OUTPUT" + echo "Changed crates: $PACKAGES" + + # `datafusion-substrait` (and crates that depend on it via sqllogictest) + # have a build script that calls protoc, which is not preinstalled on + # ubuntu-latest runners. + - name: Install Protobuf Compiler + if: steps.changed_crates.outputs.packages != '' + run: | + sudo apt-get update + sudo apt-get install -y protobuf-compiler + + - name: Install cargo-semver-checks + if: steps.changed_crates.outputs.packages != '' + uses: taiki-e/install-action@94cb46f8d6e437890146ffbd78a778b78e623fb2 # v2.74.0 + with: + tool: cargo-semver-checks + + - name: Run cargo-semver-checks + id: check_semver + if: steps.changed_crates.outputs.packages != '' + env: + BASE_REF: ${{ github.base_ref }} + PACKAGES: ${{ steps.changed_crates.outputs.packages }} + run: | + set +e + # `tee` lets cargo's output stream live into the Actions log + # while we also keep a copy for the PR comment. + ci/scripts/changed_crates.sh semver-check "origin/${BASE_REF}" $PACKAGES \ + 2>&1 | tee /tmp/semver-output.txt + EXIT_CODE=${PIPESTATUS[0]} + # Pass the result through an output instead of failing the job: + # a detected breaking change should surface as a PR comment, not a + # red check, so PR authors aren't confused by an intentional break. 
+ if [ "$EXIT_CODE" -eq 0 ]; then + echo "result=success" >> "$GITHUB_OUTPUT" + else + echo "result=failure" >> "$GITHUB_OUTPUT" + fi + + # Stage the data the companion comment workflow needs into a single + # directory. We default the result to "success" so the comment + # workflow clears any stale comment when the check step is skipped + # (e.g. no published crates changed). + - name: Stage artifact for comment workflow + if: always() + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + CHECK_RESULT: ${{ steps.check_semver.outputs.result || 'success' }} + run: | + mkdir -p semver-artifact + echo "$PR_NUMBER" > semver-artifact/pr_number + echo "$CHECK_RESULT" > semver-artifact/result + if [ -f /tmp/semver-output.txt ]; then + sed 's/\x1b\[[0-9;]*m//g' /tmp/semver-output.txt > semver-artifact/logs + else + : > semver-artifact/logs + fi + + - name: Upload artifact + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: semver-check-result + path: semver-artifact/ + retention-days: 1 diff --git a/.github/workflows/breaking_changes_detector_comment.yml b/.github/workflows/breaking_changes_detector_comment.yml new file mode 100644 index 0000000000000..8e79426082557 --- /dev/null +++ b/.github/workflows/breaking_changes_detector_comment.yml @@ -0,0 +1,132 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Companion to `breaking_changes_detector.yml`. Posts the sticky PR comment. +# +# Why this workflow exists: +# "The GITHUB_TOKEN has read-only permissions in pull requests from forked +# repositories." +# https://docs.github.com/en/actions/reference/events-that-trigger-workflows#pull_request +# That is why the upstream `pull_request` workflow cannot post the comment +# itself when the PR comes from a fork. +# +# Why not `pull_request_target`? ASF infra policy forbids it: +# "You MUST NOT use `pull_request_target` as a trigger on ANY action that +# exports ANY confidential credentials or tokens such as GITHUB_TOKEN or +# NPM_TOKEN." +# https://infra.apache.org/github-actions-policy.html +# `workflow_run` is the supported alternative: it runs in the base +# repository's context regardless of where the upstream run was triggered +# from, so the GITHUB_TOKEN here can be granted `pull-requests: write`. See: +# https://docs.github.com/en/actions/reference/events-that-trigger-workflows#workflow_run +# +# Security note: this workflow MUST NOT check out or execute any code from +# the PR. The artifact's contents originate from a workflow run that may +# have compiled fork-controlled code, so PR_NUMBER and CHECK_RESULT are +# validated against strict patterns before being passed to any action. 
+ +name: "Detect breaking changes - Comment" + +on: + workflow_run: + workflows: ["Detect breaking changes"] + types: + - completed + +permissions: + contents: read + +jobs: + comment-on-pr: + name: Comment on pull request + if: github.event.workflow_run.event == 'pull_request' + runs-on: ubuntu-latest + # Scoped to the minimum needed to upsert/delete the sticky comment. + permissions: + actions: read + pull-requests: write + steps: + - name: Download semver-check artifact + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: semver-check-result + run-id: ${{ github.event.workflow_run.id }} + github-token: ${{ github.token }} + path: ./semver-artifact + + - name: Read and validate artifact + id: read + run: | + set -euo pipefail + # Validate every field: the artifact comes from a workflow run + # that compiled fork-controlled code, so its contents are untrusted. + PR_NUMBER=$(cat ./semver-artifact/pr_number) + if ! [[ "$PR_NUMBER" =~ ^[0-9]+$ ]]; then + echo "Invalid PR number: $PR_NUMBER" >&2 + exit 1 + fi + CHECK_RESULT=$(cat ./semver-artifact/result) + if [[ "$CHECK_RESULT" != "success" && "$CHECK_RESULT" != "failure" ]]; then + echo "Invalid check result: $CHECK_RESULT" >&2 + exit 1 + fi + echo "pr_number=$PR_NUMBER" >> "$GITHUB_OUTPUT" + echo "result=$CHECK_RESULT" >> "$GITHUB_OUTPUT" + + # Multi-line output: random delimiter so a malicious log line can't + # close the heredoc and inject extra output keys. See: + # https://docs.github.com/en/actions/reference/workflow-commands-for-github-actions#multiline-strings + DELIM="EOF_$(openssl rand -hex 16)" + { + echo "logs<<${DELIM}" + cat ./semver-artifact/logs + echo "${DELIM}" + } >> "$GITHUB_OUTPUT" + + # The marker `` is what makes the comment + # "sticky": maintain-one-comment uses it to find and replace (or + # delete) the existing comment instead of stacking new ones. + - name: Upsert sticky comment + if: steps.read.outputs.result != 'success' + uses: actions-cool/maintain-one-comment@909842216bc8e8658364c572ec52100f4c2cc50a # v3.3.0 + with: + token: ${{ secrets.GITHUB_TOKEN }} + number: ${{ steps.read.outputs.pr_number }} + body-include: '' + body: | + + Thank you for opening this pull request! + + Reviewer note: [cargo-semver-checks](https://github.com/obi1kenobi/cargo-semver-checks) reported the current version number is not SemVer-compatible with the changes in this pull request (compared against the base branch). + +
+ <details> + <summary>Details</summary> + + ``` + ${{ steps.read.outputs.logs }} + ``` + + </details>
+ + - name: Delete sticky comment + if: steps.read.outputs.result == 'success' + uses: actions-cool/maintain-one-comment@909842216bc8e8658364c572ec52100f4c2cc50a # v3.3.0 + with: + token: ${{ secrets.GITHUB_TOKEN }} + number: ${{ steps.read.outputs.pr_number }} + body-include: '' + delete: true diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000000000..70d38b28112de --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,55 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +name: "CodeQL" + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + schedule: + - cron: '16 4 * * 1' + +permissions: + contents: read + +jobs: + analyze: + name: Analyze Actions + runs-on: ubuntu-slim + permissions: + contents: read + security-events: write + packages: read + + steps: + - name: Checkout repository + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false + + - name: Initialize CodeQL + uses: github/codeql-action/init@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4 + with: + languages: actions + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4 + with: + category: "/language:actions" diff --git a/.github/workflows/dependencies.yml b/.github/workflows/dependencies.yml index 7e736e1a7afbf..2f3a127ef98c4 100644 --- a/.github/workflows/dependencies.yml +++ b/.github/workflows/dependencies.yml @@ -25,26 +25,23 @@ on: push: branches-ignore: - 'gh-readonly-queue/**' - paths: - - "**/Cargo.toml" - - "**/Cargo.lock" pull_request: - paths: - - "**/Cargo.toml" - - "**/Cargo.lock" merge_group: # manual trigger # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow workflow_dispatch: +permissions: + contents: read + jobs: depcheck: - name: circular dependency check + name: Circular Dependency Check runs-on: ubuntu-latest container: image: amd64/rust steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true fetch-depth: 1 @@ -58,12 +55,13 @@ jobs: cargo run detect-unused-dependencies: + name: Detect Unused Dependencies runs-on: ubuntu-latest container: image: amd64/rust steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Install cargo-machete run: cargo install cargo-machete --version ^0.9 --locked - name: Detect unused dependencies - run: cargo machete --with-metadata \ No newline at end of file + run: cargo machete --with-metadata diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 
cc879f66cc936..376e68bcd5621 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -23,6 +23,9 @@ on: pull_request: merge_group: +permissions: + contents: read + concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} cancel-in-progress: true @@ -32,28 +35,60 @@ jobs: runs-on: ubuntu-latest name: Check License Header steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Install HawkEye + # This CI job is bound by installation time; use `--profile dev` to speed it up run: cargo install hawkeye --version 6.2.0 --locked --profile dev - name: Run license header check run: ci/scripts/license_header.sh prettier: name: Use prettier to check formatting of documents - runs-on: ubuntu-latest + runs-on: ubuntu-slim steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - - uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - uses: actions/setup-node@6044e13b5dc448c55e2357c09f80417699197238 # v6.2.0 with: node-version: "20" - name: Prettier check + # if you encounter an error, see the instructions inside the script + run: ci/scripts/doc_prettier_check.sh + + markdown-link-check: + name: Check Markdown Links + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Load tool versions run: | - # if you encounter error, rerun the command below and commit the changes - # - # ignore subproject CHANGELOG.md because they are machine generated - npx prettier@2.7.1 --write \ - '{datafusion,datafusion-cli,datafusion-examples,dev,docs}/**/*.md' \ - '!datafusion/CHANGELOG.md' \ - README.md \ - CONTRIBUTING.md - git diff --exit-code + source ci/scripts/utils/tool_versions.sh + echo "LYCHEE_VERSION=${LYCHEE_VERSION}" >> "$GITHUB_ENV" + - name: Install lychee + uses: taiki-e/install-action@481c34c1cf3a84c68b5e46f4eccfc82af798415a # v2.75.23 + with: + tool: lychee@${{ env.LYCHEE_VERSION }} + - name: Run markdown link check + run: bash ci/scripts/markdown_link_check.sh + + asf-yaml-check: + name: Validate required_status_checks in .asf.yaml + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - run: pip install pyyaml + - run: python3 ci/scripts/check_asf_yaml_status_checks.py + + typos: + name: Spell Check with Typos + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + # Version fixed on purpose. It uses heuristics to detect typos, so upgrading + # it may cause checks to fail more often. + # We can upgrade it manually once in a while.
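The same pinned check can be reproduced locally before pushing. A short sketch, assuming `ci/scripts/typos_check.sh` (invoked in the step below) is a thin wrapper around the `typos` binary:

```bash
# Run the spell check CI uses, at the same pinned version as the step below.
cargo install typos-cli --locked --version 1.37.0
# ci/scripts/typos_check.sh is assumed to wrap a plain `typos` invocation;
# running the binary from the repo root checks the same tree.
typos
```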
+ - name: Install typos-cli + run: cargo install typos-cli --locked --version 1.37.0 + - name: Run typos check + run: ci/scripts/typos_check.sh diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 588bf46aaca70..f0fbea566af69 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -28,36 +28,37 @@ name: Deploy DataFusion site jobs: build-docs: + permissions: + contents: write name: Build docs runs-on: ubuntu-latest steps: - name: Checkout docs sources - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Checkout asf-site branch - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: ref: asf-site path: asf-site - - name: Setup Python - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 - with: - python-version: "3.12" + - name: Setup uv + uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0 - name: Install dependencies + run: uv sync --package datafusion-docs + - name: Install dependency graph tooling run: | set -x - python3 -m venv venv - source venv/bin/activate - pip install -r docs/requirements.txt + sudo apt-get update + sudo apt-get install -y graphviz + cargo install cargo-depgraph --version ^1.6 --locked - name: Build docs run: | set -x - source venv/bin/activate cd docs - ./build.sh + uv run --package datafusion-docs ./build.sh - name: Copy & push the generated HTML run: | diff --git a/.github/workflows/docs_pr.yaml b/.github/workflows/docs_pr.yaml index c182f2ef85d23..4b8d25b0611eb 100644 --- a/.github/workflows/docs_pr.yaml +++ b/.github/workflows/docs_pr.yaml @@ -33,31 +33,31 @@ on: # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow workflow_dispatch: +permissions: + contents: read + jobs: - # Test doc build linux-test-doc-build: name: Test doc build runs-on: ubuntu-latest steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true fetch-depth: 1 - - name: Setup Python - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 - with: - python-version: "3.12" + - name: Setup uv + uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0 - name: Install doc dependencies + run: uv sync --package datafusion-docs + - name: Install dependency graph tooling run: | set -x - python3 -m venv venv - source venv/bin/activate - pip install -r docs/requirements.txt + sudo apt-get update + sudo apt-get install -y graphviz + cargo install cargo-depgraph --version ^1.6 --locked - name: Build docs html and check for warnings run: | set -x - source venv/bin/activate cd docs - ./build.sh # fails on errors - + uv run --package datafusion-docs ./build.sh # fails on errors diff --git a/.github/workflows/extended.yml b/.github/workflows/extended.yml index 2472d2e0424fd..5776aed31b761 100644 --- a/.github/workflows/extended.yml +++ b/.github/workflows/extended.yml @@ -44,15 +44,10 @@ on: - 'datafusion/physical*/**/*.rs' - 'datafusion/expr*/**/*.rs' - 'datafusion/optimizer/**/*.rs' + - 'datafusion/sql/**/*.rs' - 'datafusion-testing' workflow_dispatch: inputs: - pr_number: - description: 'Pull request number' - type: string - check_run_id: - description: 'Check run ID for status updates' - type: string pr_head_sha: 
description: 'PR head SHA' type: string @@ -66,10 +61,11 @@ jobs: # Check crate compiles and base cargo check passes linux-build-lib: name: linux build test - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=8,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }} # note: do not use amd/rust container to preserve disk space steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: ref: ${{ github.event.inputs.pr_head_sha }} # will be empty if triggered by push submodules: true @@ -80,7 +76,9 @@ jobs: source $HOME/.cargo/env rustup toolchain install - name: Install Protobuf Compiler - run: sudo apt-get install -y protobuf-compiler + run: | + sudo apt-get update + sudo apt-get install -y protobuf-compiler - name: Prepare cargo build run: | cargo check --profile ci --all-targets @@ -90,10 +88,11 @@ jobs: linux-test-extended: name: cargo test 'extended_tests' (amd64) needs: [linux-build-lib] - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=32,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }} # note: do not use amd/rust container to preserve disk space steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: ref: ${{ github.event.inputs.pr_head_sha }} # will be empty if triggered by push submodules: true @@ -106,7 +105,9 @@ jobs: source $HOME/.cargo/env rustup toolchain install - name: Install Protobuf Compiler - run: sudo apt-get install -y protobuf-compiler + run: | + sudo apt-get update + sudo apt-get install -y protobuf-compiler # For debugging, test binaries can be large. 
- name: Show available disk space run: | @@ -124,7 +125,7 @@ jobs: --lib \ --tests \ --bins \ - --features avro,json,backtrace,extended_tests,recursive_protection + --features avro,json,backtrace,extended_tests,recursive_protection,parquet_encryption - name: Verify Working Directory Clean run: git diff --exit-code - name: Cleanup @@ -133,11 +134,12 @@ jobs: # Check answers are correct when hash values collide hash-collisions: name: cargo test hash collisions (amd64) - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }} container: image: amd64/rust steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: ref: ${{ github.event.inputs.pr_head_sha }} # will be empty if triggered by push submodules: true @@ -154,24 +156,20 @@ jobs: sqllogictest-sqlite: name: "Run sqllogictests with the sqlite test suite" - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=32,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }} container: image: amd64/rust steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: ref: ${{ github.event.inputs.pr_head_sha }} # will be empty if triggered by push submodules: true fetch-depth: 1 - - name: Setup Rust toolchain - uses: ./.github/actions/setup-builder - with: - rust-version: stable + # Don't use setup-builder to avoid configuring RUST_BACKTRACE which is expensive + - name: Install protobuf compiler + run: | + apt-get update && apt-get install -y protobuf-compiler - name: Run sqllogictest run: | - cargo test --features backtrace --profile release-nonlto --test sqllogictests -- --include-sqlite - cargo clean - - - - + cargo test --features backtrace,parquet_encryption --profile ci-optimized --test sqllogictests -- --include-sqlite \ No newline at end of file diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 0abf535b9741f..2d42d6ff964e8 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -31,7 +31,7 @@ on: jobs: process: name: Process - runs-on: ubuntu-latest + runs-on: ubuntu-slim # only run for users whose permissions allow them to update PRs # otherwise labeler is failing: # https://github.com/apache/datafusion/issues/3743 @@ -39,8 +39,6 @@ jobs: contents: read pull-requests: write steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - - name: Assign GitHub labels if: | github.event_name == 'pull_request_target' && diff --git a/.github/workflows/labeler/labeler-config.yml b/.github/workflows/labeler/labeler-config.yml index 38d88059dab70..0e492b6f3f6dc 100644 --- a/.github/workflows/labeler/labeler-config.yml +++ b/.github/workflows/labeler/labeler-config.yml @@ -62,7 +62,7 @@ datasource: functions: - changed-files: - - any-glob-to-any-file: ['datafusion/functions/**/*', 'datafusion/functions-aggregate/**/*', 'datafusion/functions-aggregate-common', 'datafusion/functions-nested', 'datafusion/functions-table/**/*', 
'datafusion/functions-window/**/*', 'datafusion/functions-window-common/**/*'] + - any-glob-to-any-file: ['datafusion/functions/**/*', 'datafusion/functions-aggregate/**/*', 'datafusion/functions-aggregate-common/**/*', 'datafusion/functions-nested/**/*', 'datafusion/functions-table/**/*', 'datafusion/functions-window/**/*', 'datafusion/functions-window-common/**/*'] optimizer: diff --git a/.github/workflows/large_files.yml b/.github/workflows/large_files.yml index 9cbfd6030a7f6..5a127e443fcb7 100644 --- a/.github/workflows/large_files.yml +++ b/.github/workflows/large_files.yml @@ -25,18 +25,21 @@ on: pull_request: merge_group: +permissions: + contents: read + jobs: check-files: - runs-on: ubuntu-latest + runs-on: ubuntu-slim steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 - name: Check size of new Git objects env: - # 1 MB ought to be enough for anybody. + # 1.5 MB ought to be enough for anybody. # TODO in case we may want to consciously commit a bigger file to the repo without using Git LFS we may disable the check e.g. with a label - MAX_FILE_SIZE_BYTES: 1048576 + MAX_FILE_SIZE_BYTES: 1572864 shell: bash run: | if [ "${{ github.event_name }}" = "merge_group" ]; then diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 88d9f4e13378c..3f6462f0f01c1 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +# For some actions, we use Runs-On to run them on ASF infrastructure: https://datafusion.apache.org/contributor-guide/#ci-runners + name: Rust concurrency: @@ -36,26 +38,29 @@ on: - "**.md" - ".github/ISSUE_TEMPLATE/**" - ".github/pull_request_template.md" - merge_group: # manual trigger # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow workflow_dispatch: +permissions: + contents: read + jobs: # Check crate compiles and base cargo check passes linux-build-lib: name: linux build test - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=8,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }} container: image: amd64/rust steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: rust-version: stable - name: Rust Dependency Cache - uses: Swatinem/rust-cache@f13886b937689c021905a6b90929199931d60db1 # v2.8.1 + uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1 with: shared-key: "amd-ci-check" # this job uses it's own cache becase check has a separate cache and we need it to be fast as it blocks other jobs save-if: ${{ github.ref_name == 'main' }} @@ -77,7 +82,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -98,17 +103,17 @@ jobs: linux-datafusion-substrait-features: name: cargo check datafusion-substrait features needs: linux-build-lib - runs-on: ubuntu-latest + runs-on: ${{ 
github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }} container: image: amd64/rust steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: rust-version: stable - name: Rust Dependency Cache - uses: Swatinem/rust-cache@f13886b937689c021905a6b90929199931d60db1 # v2.8.1 + uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1 with: save-if: false # set in linux-test shared-key: "amd-ci" @@ -135,11 +140,12 @@ jobs: linux-datafusion-proto-features: name: cargo check datafusion-proto features needs: linux-build-lib - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }} container: image: amd64/rust steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -166,17 +172,18 @@ jobs: linux-cargo-check-datafusion: name: cargo check datafusion features needs: linux-build-lib - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }} container: image: amd64/rust steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: rust-version: stable - name: Rust Dependency Cache - uses: Swatinem/rust-cache@f13886b937689c021905a6b90929199931d60db1 # v2.8.1 + uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1 with: save-if: false # set in linux-test shared-key: "amd-ci" @@ -209,8 +216,6 @@ jobs: run: cargo check --profile ci --no-default-features -p datafusion --features=math_expressions - name: Check datafusion (parquet) run: cargo check --profile ci --no-default-features -p datafusion --features=parquet - - name: Check datafusion (pyarrow) - run: cargo check --profile ci --no-default-features -p datafusion --features=pyarrow - name: Check datafusion (regex_expressions) run: cargo check --profile ci --no-default-features -p datafusion --features=regex_expressions - name: Check datafusion (recursive_protection) @@ -237,7 +242,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -268,11 +273,14 @@ jobs: linux-test: name: cargo test (amd64) needs: linux-build-lib - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }} container: image: 
amd64/rust + volumes: + - /usr/local:/host/usr/local steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true fetch-depth: 1 @@ -281,7 +289,7 @@ jobs: with: rust-version: stable - name: Rust Dependency Cache - uses: Swatinem/rust-cache@f13886b937689c021905a6b90929199931d60db1 # v2.8.1 + uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1 with: save-if: ${{ github.ref_name == 'main' }} shared-key: "amd-ci" @@ -299,7 +307,7 @@ jobs: --lib \ --tests \ --bins \ - --features serde,avro,json,backtrace,integration-tests,parquet_encryption + --features serde,avro,json,backtrace,integration-tests,parquet_encryption,substrait - name: Verify Working Directory Clean run: git diff --exit-code # Check no temporary directories created during test. @@ -316,16 +324,17 @@ jobs: linux-test-datafusion-cli: name: cargo test datafusion-cli (amd64) needs: linux-build-lib - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }} steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true fetch-depth: 1 - name: Setup Rust toolchain run: rustup toolchain install stable - name: Rust Dependency Cache - uses: Swatinem/rust-cache@f13886b937689c021905a6b90929199931d60db1 # v2.8.1 + uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1 with: save-if: false # set in linux-test shared-key: "amd-ci" @@ -345,11 +354,12 @@ jobs: linux-test-example: name: cargo examples (amd64) needs: linux-build-lib - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }} container: image: amd64/rust steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true fetch-depth: 1 @@ -358,23 +368,10 @@ jobs: with: rust-version: stable - name: Rust Dependency Cache - uses: Swatinem/rust-cache@f13886b937689c021905a6b90929199931d60db1 # v2.8.1 + uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1 with: save-if: ${{ github.ref_name == 'main' }} shared-key: "amd-ci-linux-test-example" - - name: Remove unnecessary preinstalled software - run: | - echo "Disk space before cleanup:" - df -h - apt-get clean - rm -rf /__t/CodeQL - rm -rf /__t/PyPy - rm -rf /__t/Java_Temurin-Hotspot_jdk - rm -rf /__t/Python - rm -rf /__t/go - rm -rf /__t/Ruby - echo "Disk space after cleanup:" - df -h - name: Run examples run: | # test datafusion-sql examples @@ -388,11 +385,12 @@ jobs: linux-test-doc: name: cargo test doc (amd64) needs: linux-build-lib - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }} 
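A recurring change in this file (and in extended.yml above): `runs-on` becomes `${{ cond && format(...) || 'ubuntu-latest' }}`. GitHub expressions have no ternary operator, but because `format(...)` always produces a truthy string, the `&&`/`||` pair selects between the Runs-On self-hosted pool (for the `apache` org) and stock runners (for forks). The same short-circuit idiom in shell, for intuition:

```bash
# Shell analogue of the GitHub expression `cond && format(...) || 'ubuntu-latest'`.
# When the middle operand is always truthy, && / || act as a ternary.
owner="apache"   # CI reads github.repository_owner; the value here is illustrative
runner=$([ "$owner" = "apache" ] && echo "runs-on-self-hosted-pool" || echo "ubuntu-latest")
echo "$runner"   # prints: runs-on-self-hosted-pool
```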
container: image: amd64/rust steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true fetch-depth: 1 @@ -409,11 +407,12 @@ jobs: linux-rustdoc: name: cargo doc needs: linux-build-lib - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }} container: image: amd64/rust steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -425,7 +424,7 @@ jobs: name: build and run with wasm-pack runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Setup for wasm32 run: | rustup target add wasm32-unknown-unknown @@ -434,7 +433,7 @@ jobs: sudo apt-get update -qq sudo apt-get install -y -qq clang - name: Setup wasm-pack - uses: taiki-e/install-action@f535147c22906d77695e11cb199e764aa610a4fc # v2.62.46 + uses: taiki-e/install-action@481c34c1cf3a84c68b5e46f4eccfc82af798415a # v2.75.23 with: tool: wasm-pack - name: Run tests with headless mode @@ -449,11 +448,12 @@ jobs: verify-benchmark-results: name: verify benchmark results (amd64) needs: linux-build-lib - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }} container: image: amd64/rust steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true fetch-depth: 1 @@ -475,14 +475,14 @@ jobs: export RUST_MIN_STACK=20971520 export TPCH_DATA=`realpath datafusion/sqllogictest/test_files/tpch/data` cargo test plan_q --package datafusion-benchmarks --profile ci --features=ci -- --test-threads=1 - INCLUDE_TPCH=true cargo test --features backtrace --profile ci --package datafusion-sqllogictest --test sqllogictests + INCLUDE_TPCH=true cargo test --features backtrace,parquet_encryption,substrait --profile ci --package datafusion-sqllogictest --test sqllogictests - name: Verify Working Directory Clean run: git diff --exit-code sqllogictest-postgres: name: "Run sqllogictest with Postgres runner" needs: linux-build-lib - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }} container: image: amd64/rust services: @@ -500,7 +500,8 @@ jobs: --health-timeout 5s --health-retries 5 steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true fetch-depth: 1 @@ -520,11 +521,12 @@ jobs: 
sqllogictest-substrait: name: "Run sqllogictest in Substrait round-trip mode" needs: linux-build-lib - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }} container: image: amd64/rust steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true fetch-depth: 1 @@ -537,7 +539,7 @@ jobs: # command cannot be run for all the .slt files. Run it for just one that works (limit.slt) # until most of the tickets in https://github.com/apache/datafusion/issues/16248 are addressed # and this command can be run without filters. - run: cargo test --test sqllogictests -- --substrait-round-trip limit.slt + run: cargo test -p datafusion-sqllogictest --test sqllogictests --features substrait -- --substrait-round-trip limit.slt # Temporarily commenting out the Windows flow, the reason is enormously slow running build # Waiting for new Windows 2025 github runner @@ -560,9 +562,9 @@ jobs: macos-aarch64: name: cargo test (macos-aarch64) - runs-on: macos-14 + runs-on: macos-15 steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true fetch-depth: 1 @@ -570,31 +572,7 @@ jobs: uses: ./.github/actions/setup-macos-aarch64-builder - name: Run tests (excluding doctests) shell: bash - run: cargo test --profile ci --exclude datafusion-cli --workspace --lib --tests --bins --features avro,json,backtrace,integration-tests - - test-datafusion-pyarrow: - name: cargo test pyarrow (amd64) - needs: linux-build-lib - runs-on: ubuntu-latest - container: - image: amd64/rust:bullseye # Use the bullseye tag image which comes with python3.9 - steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - with: - submodules: true - fetch-depth: 1 - - name: Install PyArrow - run: | - echo "LIBRARY_PATH=$LD_LIBRARY_PATH" >> $GITHUB_ENV - apt-get update - apt-get install python3-pip -y - python3 -m pip install pyarrow - - name: Setup Rust toolchain - uses: ./.github/actions/setup-builder - with: - rust-version: stable - - name: Run datafusion-common tests - run: cargo test --profile ci -p datafusion-common --features=pyarrow,sql + run: cargo test --profile ci --exclude datafusion-cli --workspace --lib --tests --bins --features avro,json,backtrace,integration-tests,substrait vendor: name: Verify Vendored Code @@ -602,7 +580,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -619,7 +597,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -674,11 +652,12 @@ jobs: clippy: name: clippy needs: linux-build-lib - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'apache' && 
format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }} container: image: amd64/rust steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true fetch-depth: 1 @@ -689,7 +668,7 @@ jobs: - name: Install Clippy run: rustup component add clippy - name: Rust Dependency Cache - uses: Swatinem/rust-cache@f13886b937689c021905a6b90929199931d60db1 # v2.8.1 + uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1 with: save-if: ${{ github.ref_name == 'main' }} shared-key: "amd-ci-clippy" @@ -703,7 +682,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true fetch-depth: 1 @@ -720,11 +699,12 @@ jobs: config-docs-check: name: check configs.md and ***_functions.md is up-to-date needs: linux-build-lib - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }} container: image: amd64/rust steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: true fetch-depth: 1 @@ -732,7 +712,7 @@ jobs: uses: ./.github/actions/setup-builder with: rust-version: stable - - uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0 + - uses: actions/setup-node@6044e13b5dc448c55e2357c09f80417699197238 # v6.2.0 with: node-version: "20" - name: Check if configs.md has been modified @@ -746,6 +726,38 @@ jobs: ./dev/update_function_docs.sh git diff --exit-code +# This job ensures `datafusion-examples/README.md` stays in sync with the source code: +# 1. Generates README automatically using the Rust examples docs generator +# (parsing documentation from `examples//main.rs`) +# 2. Formats the generated Markdown using DataFusion's standard Prettier setup +# 3. 
Compares the result against the committed README.md and fails if out-of-date + examples-docs-check: + name: check example README is up-to-date + needs: linux-build-lib + runs-on: ubuntu-latest + container: + image: amd64/rust + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + submodules: true + fetch-depth: 1 + + - name: Mark repository as safe for git + # Required for git commands inside container (avoids "dubious ownership" error) + run: git config --global --add safe.directory "$GITHUB_WORKSPACE" + + - name: Set up Node.js (required for prettier) + # doc_prettier_check.sh uses npx to run prettier for Markdown formatting + uses: actions/setup-node@v6 + with: + node-version: '18' + + - name: Run examples docs check script + run: | + bash ci/scripts/check_examples_docs.sh + # Verify MSRV for the crates which are directly used by other projects: # - datafusion # - datafusion-substrait @@ -757,11 +769,11 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Install cargo-msrv - uses: taiki-e/install-action@f535147c22906d77695e11cb199e764aa610a4fc # v2.62.46 + uses: taiki-e/install-action@481c34c1cf3a84c68b5e46f4eccfc82af798415a # v2.75.23 with: tool: cargo-msrv @@ -799,11 +811,3 @@ jobs: - name: Check datafusion-proto working-directory: datafusion/proto run: cargo msrv --output-format json --log-target stdout verify - typos: - name: Spell Check with Typos - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - with: - persist-credentials: false - - uses: crate-ci/typos@07d900b8fa1097806b8adb6391b0d3e0ac2fdea7 # v1.39.0 diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index d5fc9287aa6a5..8627b3bf044ff 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -22,12 +22,13 @@ on: jobs: close-stale-prs: - runs-on: ubuntu-latest + runs-on: ubuntu-slim permissions: + actions: write issues: write pull-requests: write steps: - - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0 + - uses: actions/stale@b5d41d4e1d5dceea10e7104786b73624c18a190f # v10.2.0 with: stale-pr-message: "Thank you for your contribution. Unfortunately, this pull request is stale because it has been open 60 days with no activity. Please remove the stale label or comment or this will be closed in 7 days." days-before-pr-stale: 60 @@ -36,3 +37,4 @@ jobs: days-before-issue-stale: -1 days-before-issue-close: -1 repo-token: ${{ secrets.GITHUB_TOKEN }} + operations-per-run: 150 diff --git a/.github/workflows/take.yml b/.github/workflows/take.yml index 86dc190add1d1..e34bf869ef8a0 100644 --- a/.github/workflows/take.yml +++ b/.github/workflows/take.yml @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
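The take.yml rework below routes every `${{ ... }}` value (comment body, user login, issue number, repo) through `env:` instead of splicing it into the script. The diff does not state the motivation, but this is the standard guard against expression injection, since `${{ ... }}` is expanded into the script text before bash ever parses it. A small illustration:

```bash
# What direct interpolation would do: a workflow line like
#   if [ "${{ github.event.comment.body }}" == "take" ]; then ...
# is rewritten by the Actions runner BEFORE bash runs, so hostile comment
# text becomes part of the script source. With env: indirection the same
# text arrives as plain data in a variable and cannot escape its quotes:
COMMENT_BODY='take" ]; then :; fi; echo pwned; if [ "x'   # hostile example
if [ "$COMMENT_BODY" == "take" ]; then
  echo "assign"
else
  echo "not a take comment; the payload stayed inert"
fi
```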
-name: Assign the issue via a `take` comment +name: Assign/unassign the issue via `take` or `untake` comment on: issue_comment: types: created @@ -25,17 +25,31 @@ permissions: jobs: issue_assign: - runs-on: ubuntu-latest - if: (!github.event.issue.pull_request) && github.event.comment.body == 'take' + runs-on: ubuntu-slim + if: (!github.event.issue.pull_request) && (github.event.comment.body == 'take' || github.event.comment.body == 'untake') concurrency: group: ${{ github.actor }}-issue-assign steps: - - run: | - CODE=$(curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -LI https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees/${{ github.event.comment.user.login }} -o /dev/null -w '%{http_code}\n' -s) - if [ "$CODE" -eq "204" ] + - name: Take or untake issue + env: + COMMENT_BODY: ${{ github.event.comment.body }} + ISSUE_NUMBER: ${{ github.event.issue.number }} + USER_LOGIN: ${{ github.event.comment.user.login }} + REPO: ${{ github.repository }} + TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + if [ "$COMMENT_BODY" == "take" ] then - echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" - curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees - else - echo "Cannot assign issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" + CODE=$(curl -H "Authorization: token $TOKEN" -LI https://api.github.com/repos/$REPO/issues/$ISSUE_NUMBER/assignees/$USER_LOGIN -o /dev/null -w '%{http_code}\n' -s) + if [ "$CODE" -eq "204" ] + then + echo "Assigning issue $ISSUE_NUMBER to $USER_LOGIN" + curl -X POST -H "Authorization: token $TOKEN" -H "Content-Type: application/json" -d "{\"assignees\": [\"$USER_LOGIN\"]}" https://api.github.com/repos/$REPO/issues/$ISSUE_NUMBER/assignees + else + echo "Cannot assign issue $ISSUE_NUMBER to $USER_LOGIN" + fi + elif [ "$COMMENT_BODY" == "untake" ] + then + echo "Unassigning issue $ISSUE_NUMBER from $USER_LOGIN" + curl -X DELETE -H "Authorization: token $TOKEN" -H "Content-Type: application/json" -d "{\"assignees\": [\"$USER_LOGIN\"]}" https://api.github.com/repos/$REPO/issues/$ISSUE_NUMBER/assignees fi \ No newline at end of file diff --git a/.gitignore b/.gitignore index 8466a72adaec8..c1f9677e47366 100644 --- a/.gitignore +++ b/.gitignore @@ -75,3 +75,9 @@ rat.txt # data generated by examples datafusion-examples/examples/datafusion-examples/ + +# Samply profile data +profile.json.gz + +# Claude Code personal settings +.claude/settings.local.json diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000000000..9dff7f6f1ffd1 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,41 @@ +# Agent Guidelines for Apache DataFusion + +## Developer Documentation + +- [Quick Start Setup](docs/source/contributor-guide/development_environment.md#quick-start) +- [Testing Quick Start](docs/source/contributor-guide/testing.md#testing-quick-start) +- [Before Submitting a PR](docs/source/contributor-guide/index.md#before-submitting-a-pr) +- [Contributor Guide](docs/source/contributor-guide/index.md) +- [Architecture Guide](docs/source/contributor-guide/architecture.md) + +## Before Committing + +Before committing any changes, you MUST follow the instructions in +[Before Submitting a PR](docs/source/contributor-guide/index.md#before-submitting-a-pr) +and ensure the required checks listed there 
pass. Do not commit code that +fails any of those checks. + +At a minimum, you MUST run these commands and fix any errors they report +before committing: + +```bash +# Format code +cargo fmt --all + +# Lint (must pass with no warnings) +cargo clippy --all-targets --all-features -- -D warnings +``` + +You can also run the full lint suite used by CI: + +```bash +./dev/rust_lint.sh +# or auto-fix: ./dev/rust_lint.sh --write --allow-dirty +``` + +When creating a PR, you MUST follow the [PR template](.github/pull_request_template.md). + +## Testing + +See the [Testing Quick Start](docs/source/contributor-guide/testing.md#testing-quick-start) +for the recommended pre-PR test commands. diff --git a/CLAUDE.md b/CLAUDE.md new file mode 120000 index 0000000000000..47dc3e3d863cf --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +AGENTS.md \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index f500265108ff5..af52588e5338e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,71 +2,12 @@ # It is not intended for manual editing. version = 4 -[[package]] -name = "abi_stable" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69d6512d3eb05ffe5004c59c206de7f99c34951504056ce23fc953842f12c445" -dependencies = [ - "abi_stable_derive", - "abi_stable_shared", - "const_panic", - "core_extensions", - "crossbeam-channel", - "generational-arena", - "libloading 0.7.4", - "lock_api", - "parking_lot", - "paste", - "repr_offset", - "rustc_version", - "serde", - "serde_derive", - "serde_json", -] - -[[package]] -name = "abi_stable_derive" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7178468b407a4ee10e881bc7a328a65e739f0863615cca4429d43916b05e898" -dependencies = [ - "abi_stable_shared", - "as_derive_utils", - "core_extensions", - "proc-macro2", - "quote", - "rustc_version", - "syn 1.0.109", - "typed-arena", -] - -[[package]] -name = "abi_stable_shared" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2b5df7688c123e63f4d4d649cba63f2967ba7f7861b1664fca3f77d3dad2b63" -dependencies = [ - "core_extensions", -] - [[package]] name = "adler2" version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" -[[package]] -name = "ahash" -version = "0.7.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "891477e0c6a8957309ee5c45a6368af3ae14bb510732d2684ffa19af310920f9" -dependencies = [ - "getrandom 0.2.16", - "once_cell", - "version_check", -] - [[package]] name = "ahash" version = "0.8.12" @@ -83,9 +24,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.1.3" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" dependencies = [ "memchr", ] @@ -105,6 +46,15 @@ dependencies = [ "alloc-no-stdlib", ] +[[package]] +name = "alloca" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5a7d05ea6aea7e9e64d25b9156ba2fee3fdd659e34e41063cd2fc7cd020d7f4" +dependencies = [ + "cc", +] + [[package]] name = "allocator-api2" version = "0.2.21" @@ -128,9 +78,9 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" -version = "0.6.20" +version = "1.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ae563653d1938f79b1ab1b5e668c87c76a9930414574a6583a7b7e11a8e6192" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" dependencies = [ "anstyle", "anstyle-parse", @@ -143,33 +93,33 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.11" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" [[package]] name = "anstyle-parse" -version = "0.2.7" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.1.4" +version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ "windows-sys 0.60.2", ] [[package]] name = "anstyle-wincon" -version = "3.0.10" +version = "3.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", @@ -178,37 +128,17 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.100" +version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" [[package]] -name = "apache-avro" -version = "0.20.0" +name = "ar_archive_writer" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a033b4ced7c585199fb78ef50fca7fe2f444369ec48080c5fd072efa1a03cc7" +checksum = "7eb93bbb63b9c227414f6eb3a0adfddca591a8ce1e9b60661bb08969b87e340b" dependencies = [ - "bigdecimal", - "bon", - "bzip2 0.6.1", - "crc32fast", - "digest", - "log", - "miniz_oxide", - "num-bigint", - "quad-rand", - "rand 0.9.2", - "regex-lite", - "serde", - "serde_bytes", - "serde_json", - "snap", - "strum 0.27.2", - "strum_macros 0.27.2", - "thiserror", - "uuid", - "xz2", - "zstd", + "object", ] [[package]] @@ -225,9 +155,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "57.0.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4df8bb5b0bd64c0b9bc61317fcc480bad0f00e56d3bc32c69a4c8dada4786bae" +checksum = "d441fdda254b65f3e9025910eb2c2066b6295d9c8ed409522b8d2ace1ff8574c" dependencies = [ "arrow-arith", "arrow-array", @@ -238,20 +168,19 @@ dependencies = [ "arrow-ipc", "arrow-json", "arrow-ord", - "arrow-pyarrow", "arrow-row", "arrow-schema", "arrow-select", "arrow-string", "half", - "rand 0.9.2", + "rand 0.9.4", ] [[package]] name = "arrow-arith" -version = "57.0.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1a640186d3bd30a24cb42264c2dafb30e236a6f50d510e56d40b708c9582491" +checksum = "ced5406f8b720cc0bc3aa9cf5758f93e8593cda5490677aa194e4b4b383f9a59" dependencies = [ 
"arrow-array", "arrow-buffer", @@ -263,28 +192,52 @@ dependencies = [ [[package]] name = "arrow-array" -version = "57.0.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "219fe420e6800979744c8393b687afb0252b3f8a89b91027d27887b72aa36d31" +checksum = "772bd34cacdda8baec9418d80d23d0fb4d50ef0735685bd45158b83dfeb6e62d" dependencies = [ - "ahash 0.8.12", + "ahash", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", "chrono-tz", "half", - "hashbrown 0.16.0", + "hashbrown 0.16.1", "num-complex", "num-integer", "num-traits", ] +[[package]] +name = "arrow-avro" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36a3aadd016f63dfd4941ae8e13539ba98a3c2995adc3c88b9336d2514f6c8a7" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-schema", + "bytes", + "bzip2", + "crc", + "flate2", + "indexmap 2.14.0", + "liblzma", + "rand 0.9.4", + "serde", + "serde_json", + "snap", + "strum_macros", + "uuid", + "zstd", +] + [[package]] name = "arrow-buffer" -version = "57.0.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76885a2697a7edf6b59577f568b456afc94ce0e2edc15b784ce3685b6c3c5c27" +checksum = "898f4cf1e9598fdb77f356fdf2134feedfd0ee8d5a4e0a5f573e7d0aec16baa4" dependencies = [ "bytes", "half", @@ -294,13 +247,14 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "57.0.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c9ebb4c987e6b3b236fb4a14b20b34835abfdd80acead3ccf1f9bf399e1f168" +checksum = "b0127816c96533d20fc938729f48c52d3e48f99717e7a0b5ade77d742510736d" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", + "arrow-ord", "arrow-schema", "arrow-select", "atoi", @@ -315,9 +269,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "57.0.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92386159c8d4bce96f8bd396b0642a0d544d471bdc2ef34d631aec80db40a09c" +checksum = "ca025bd0f38eeecb57c2153c0123b960494138e6a957bbda10da2b25415209fe" dependencies = [ "arrow-array", "arrow-cast", @@ -330,9 +284,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "57.0.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "727681b95de313b600eddc2a37e736dcb21980a40f640314dcf360e2f36bc89b" +checksum = "42d10beeab2b1c3bb0b53a00f7c944a178b622173a5c7bcabc3cb45d90238df4" dependencies = [ "arrow-buffer", "arrow-schema", @@ -343,9 +297,9 @@ dependencies = [ [[package]] name = "arrow-flight" -version = "57.0.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f70bb56412a007b0cfc116d15f24dda6adeed9611a213852a004cda20085a3b9" +checksum = "302b2e036335f3f04d65dad3f74ff1f2aae6dc671d6aa04dc6b61193761e16fb" dependencies = [ "arrow-arith", "arrow-array", @@ -371,9 +325,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "57.0.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9ba92e3de170295c98a84e5af22e2b037f0c7b32449445e6c493b5fca27f27" +checksum = "609a441080e338147a84e8e6904b6da482cefb957c5cdc0f3398872f69a315d0" dependencies = [ "arrow-array", "arrow-buffer", @@ -387,9 +341,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "57.0.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b969b4a421ae83828591c6bf5450bd52e6d489584142845ad6a861f42fe35df8" +checksum = "6ead0914e4861a531be48fe05858265cf854a4880b9ed12618b1d08cba9bebc8" dependencies = [ "arrow-array", "arrow-buffer", @@ -398,7 +352,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap 2.12.0", + "indexmap 2.14.0", "itoa", "lexical-core", "memchr", @@ -411,9 +365,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "57.0.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "141c05298b21d03e88062317a1f1a73f5ba7b6eb041b350015b1cd6aabc0519b" +checksum = "763a7ba279b20b52dad300e68cfc37c17efa65e68623169076855b3a9e941ca5" dependencies = [ "arrow-array", "arrow-buffer", @@ -422,23 +376,11 @@ dependencies = [ "arrow-select", ] -[[package]] -name = "arrow-pyarrow" -version = "57.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfcfb2be2e9096236f449c11f425cddde18c4cc540f516d90f066f10a29ed515" -dependencies = [ - "arrow-array", - "arrow-data", - "arrow-schema", - "pyo3", -] - [[package]] name = "arrow-row" -version = "57.0.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f3c06a6abad6164508ed283c7a02151515cef3de4b4ff2cebbcaeb85533db2" +checksum = "e14fe367802f16d7668163ff647830258e6e0aeea9a4d79aaedf273af3bdcd3e" dependencies = [ "arrow-array", "arrow-buffer", @@ -449,11 +391,11 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "57.0.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cfa7a03d1eee2a4d061476e1840ad5c9867a544ca6c4c59256496af5d0a8be5" +checksum = "c30a1365d7a7dc50cc847e54154e6af49e4c4b0fddc9f607b687f29212082743" dependencies = [ - "bitflags 2.9.4", + "bitflags", "serde", "serde_core", "serde_json", @@ -461,11 +403,11 @@ dependencies = [ [[package]] name = "arrow-select" -version = "57.0.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bafa595babaad59f2455f4957d0f26448fb472722c186739f4fac0823a1bdb47" +checksum = "78694888660a9e8ac949853db393af2a8b8fc82c19ce333132dfa2e72cc1a7fe" dependencies = [ - "ahash 0.8.12", + "ahash", "arrow-array", "arrow-buffer", "arrow-data", @@ -475,9 +417,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "57.0.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32f46457dbbb99f2650ff3ac23e46a929e0ab81db809b02aa5511c258348bef2" +checksum = "61e04a01f8bb73ce54437514c5fd3ee2aa3e8abe4c777ee5cc55853b1652f79e" dependencies = [ "arrow-array", "arrow-buffer", @@ -490,23 +432,11 @@ dependencies = [ "regex-syntax", ] -[[package]] -name = "as_derive_utils" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff3c96645900a44cf11941c111bd08a6573b0e2f9f69bc9264b179d8fae753c4" -dependencies = [ - "core_extensions", - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "astral-tokio-tar" -version = "0.5.6" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec179a06c1769b1e42e1e2cbe74c7dcdb3d6383c838454d063eaac5bbb7ebbe5" +checksum = "4ce73b17c62717c4b6a9af10b43e87c578b0cac27e00666d48304d3b7d2c0693" dependencies = [ "filetime", "futures-core", @@ -520,19 +450,14 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.19" +version = "0.4.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c" +checksum = "e79b3f8a79cccc2898f31920fc69f304859b3bd567490f75ebf51ae1c792a9ac" dependencies = [ - "bzip2 0.5.2", - "flate2", - "futures-core", - "memchr", + "compression-codecs", + "compression-core", "pin-project-lite", "tokio", - "xz2", - "zstd", - "zstd-safe", ] [[package]] @@ -540,9 +465,6 @@ name = "async-ffi" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f4de21c0feef7e5a556e51af767c953f0501f7f300ba785cc99c47bdc8081a50" -dependencies = [ - "abi_stable", -] [[package]] name = "async-recursion" @@ -552,7 +474,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] @@ -574,7 +496,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] @@ -585,7 +507,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] @@ -611,9 +533,9 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "aws-config" -version = "1.8.7" +version = "1.8.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04b37ddf8d2e9744a0b9c19ce0b78efe4795339a90b66b7bae77987092cd2e69" +checksum = "50f156acdd2cf55f5aa53ee416c4ac851cf1222694506c0b1f78c85695e9ca9d" dependencies = [ "aws-credential-types", "aws-runtime", @@ -630,8 +552,8 @@ dependencies = [ "bytes", "fastrand", "hex", - "http 1.3.1", - "ring", + "http 1.4.0", + "sha1 0.10.6", "time", "tokio", "tracing", @@ -641,9 +563,9 @@ dependencies = [ [[package]] name = "aws-credential-types" -version = "1.2.7" +version = "1.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "799a1290207254984cb7c05245111bc77958b92a3c9bb449598044b36341cce6" +checksum = "8f20799b373a1be121fe3005fba0c2090af9411573878f224df44b42727fcaf7" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -653,9 +575,9 @@ dependencies = [ [[package]] name = "aws-lc-rs" -version = "1.14.0" +version = "1.16.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94b8ff6c09cd57b16da53641caa860168b88c172a5ee163b0288d3d6eea12786" +checksum = "0ec6fb3fe69024a75fa7e1bfb48aa6cf59706a101658ea01bfd33b2b248a038f" dependencies = [ "aws-lc-sys", "zeroize", @@ -663,11 +585,10 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.31.0" +version = "0.40.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e44d16778acaf6a9ec9899b92cebd65580b83f685446bf2e1f5d3d732f99dcd" +checksum = "f50037ee5e1e41e7b8f9d161680a725bd1626cb6f8c7e901f91f942850852fe7" dependencies = [ - "bindgen", "cc", "cmake", "dunce", @@ -676,9 +597,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.5.11" +version = "1.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e1ed337dabcf765ad5f2fb426f13af22d576328aaf09eac8f70953530798ec0" +checksum = "5dcd93c82209ac7413532388067dce79be5a8780c1786e5fae3df22e4dee2864" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -689,9 +610,10 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", + "bytes-utils", "fastrand", - "http 0.2.12", - "http-body 0.4.6", + "http 1.4.0", + "http-body 1.0.1", "percent-encoding", "pin-project-lite", 
"tracing", @@ -700,15 +622,16 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.85.0" +version = "1.98.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f2c741e2e439f07b5d1b33155e246742353d82167c785a2ff547275b7e32483" +checksum = "d69c77aafa20460c68b6b3213c84f6423b6e76dbf89accd3e1789a686ffd9489" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -716,21 +639,23 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", + "http 1.4.0", "regex-lite", "tracing", ] [[package]] name = "aws-sdk-ssooidc" -version = "1.87.0" +version = "1.100.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6428ae5686b18c0ee99f6f3c39d94ae3f8b42894cdc35c35d8fb2470e9db2d4c" +checksum = "1c7e7b09346d5ca22a2a08267555843a6a0127fb20d8964cb6ecfb8fdb190225" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -738,21 +663,23 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", + "http 1.4.0", "regex-lite", "tracing", ] [[package]] name = "aws-sdk-sts" -version = "1.87.0" +version = "1.103.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5871bec9a79a3e8d928c7788d654f135dde0e71d2dd98089388bab36b37ef607" +checksum = "c2249b81a2e73a8027c41c378463a81ec39b8510f184f2caab87de912af0f49b" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -761,15 +688,16 @@ dependencies = [ "aws-types", "fastrand", "http 0.2.12", + "http 1.4.0", "regex-lite", "tracing", ] [[package]] name = "aws-sigv4" -version = "1.3.4" +version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "084c34162187d39e3740cb635acd73c4e3a551a36146ad6fe8883c929c9f876c" +checksum = "68dc0b907359b120170613b5c09ccc61304eac3998ff6274b97d93ee6490115a" dependencies = [ "aws-credential-types", "aws-smithy-http", @@ -780,7 +708,7 @@ dependencies = [ "hex", "hmac", "http 0.2.12", - "http 1.3.1", + "http 1.4.0", "percent-encoding", "sha2", "time", @@ -789,9 +717,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.2.5" +version = "1.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e190749ea56f8c42bf15dd76c65e14f8f765233e6df9b0506d9d934ebef867c" +checksum = "2ffcaf626bdda484571968400c326a244598634dc75fd451325a54ad1a59acfc" dependencies = [ "futures-util", "pin-project-lite", @@ -800,18 +728,19 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.62.3" +version = "0.63.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c4dacf2d38996cf729f55e7a762b30918229917eca115de45dfa8dfb97796c9" +checksum = "ba1ab2dc1c2c3749ead27180d333c42f11be8b0e934058fb4b2258ee8dbe5231" dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "bytes", "bytes-utils", "futures-core", - "http 0.2.12", - "http 1.3.1", - "http-body 0.4.6", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", "percent-encoding", "pin-project-lite", "pin-utils", @@ -820,15 +749,15 @@ dependencies = [ [[package]] name = "aws-smithy-http-client" -version = "1.1.1" +version = 
"1.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "147e8eea63a40315d704b97bf9bc9b8c1402ae94f89d5ad6f7550d963309da1b" +checksum = "6a2f165a7feee6f263028b899d0a181987f4fa7179a6411a32a439fba7c5f769" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", "aws-smithy-types", "h2", - "http 1.3.1", + "http 1.4.0", "hyper", "hyper-rustls", "hyper-util", @@ -844,27 +773,27 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.61.5" +version = "0.62.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eaa31b350998e703e9826b2104dd6f63be0508666e1aba88137af060e8944047" +checksum = "9648b0bb82a2eedd844052c6ad2a1a822d1f8e3adee5fbf668366717e428856a" dependencies = [ "aws-smithy-types", ] [[package]] name = "aws-smithy-observability" -version = "0.1.3" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9364d5989ac4dd918e5cc4c4bdcc61c9be17dcd2586ea7f69e348fc7c6cab393" +checksum = "a06c2315d173edbf1920da8ba3a7189695827002e4c0fc961973ab1c54abca9c" dependencies = [ "aws-smithy-runtime-api", ] [[package]] name = "aws-smithy-query" -version = "0.60.7" +version = "0.60.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb" +checksum = "1a56d79744fb3edb5d722ef79d86081e121d3b9422cb209eb03aea6aa4f21ebd" dependencies = [ "aws-smithy-types", "urlencoding", @@ -872,9 +801,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.9.2" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fa63ad37685ceb7762fa4d73d06f1d5493feb88e3f27259b9ed277f4c01b185" +checksum = "0504b1ab12debb5959e5165ee5fe97dd387e7aa7ea6a477bfd7635dfe769a4f5" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -885,9 +814,10 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", - "http 1.3.1", + "http 1.4.0", "http-body 0.4.6", "http-body 1.0.1", + "http-body-util", "pin-project-lite", "pin-utils", "tokio", @@ -896,32 +826,44 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.9.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07f5e0fc8a6b3f2303f331b94504bbf754d85488f402d6f1dd7a6080f99afe56" +checksum = "b71a13df6ada0aafbf21a73bdfcdf9324cfa9df77d96b8446045be3cde61b42e" dependencies = [ "aws-smithy-async", + "aws-smithy-runtime-api-macros", "aws-smithy-types", "bytes", "http 0.2.12", - "http 1.3.1", + "http 1.4.0", "pin-project-lite", "tokio", "tracing", "zeroize", ] +[[package]] +name = "aws-smithy-runtime-api-macros" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d7396fd9500589e62e460e987ecb671bad374934e55ec3b5f498cc7a8a8a7b7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "aws-smithy-types" -version = "1.3.2" +version = "1.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d498595448e43de7f4296b7b7a18a8a02c61ec9349128c80a368f7c3b4ab11a8" +checksum = "9d73dbfbaa8e4bc57b9045137680b958d274823509a360abfd8e1d514d40c95c" dependencies = [ "base64-simd", "bytes", "bytes-utils", "http 0.2.12", - "http 1.3.1", + "http 1.4.0", "http-body 0.4.6", "http-body 1.0.1", "http-body-util", @@ -936,18 +878,18 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.60.10" +version = "0.60.15" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "3db87b96cb1b16c024980f133968d52882ca0daaee3a086c6decc500f6c99728" +checksum = "0ce02add1aa3677d022f8adf81dcbe3046a95f17a1b1e8979c145cd21d3d22b3" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "1.3.8" +version = "1.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b069d19bf01e46298eaedd7c6f283fe565a59263e53eebec945f3e6398f42390" +checksum = "2f4bbcaa9304ea40902d3d5f42a0428d1bd895a2b0f6999436fb279ffddc58ac" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -959,14 +901,14 @@ dependencies = [ [[package]] name = "axum" -version = "0.8.4" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "021e862c184ae977658b36c4500f7feac3221ca5da43e3f25bd04ab6c79a29b5" +checksum = "31b698c5f9a010f6573133b09e0de5408834d0c82f8d7475a89fc1867a71cd90" dependencies = [ "axum-core", "bytes", "futures-util", - "http 1.3.1", + "http 1.4.0", "http-body 1.0.1", "http-body-util", "itoa", @@ -975,8 +917,7 @@ dependencies = [ "mime", "percent-encoding", "pin-project-lite", - "rustversion", - "serde", + "serde_core", "sync_wrapper", "tower", "tower-layer", @@ -985,18 +926,17 @@ dependencies = [ [[package]] name = "axum-core" -version = "0.5.2" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68464cd0412f486726fb3373129ef5d2993f90c34bc2bc1c1e9943b2f4fc7ca6" +checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1" dependencies = [ "bytes", "futures-core", - "http 1.3.1", + "http 1.4.0", "http-body 1.0.1", "http-body-util", "mime", "pin-project-lite", - "rustversion", "sync_wrapper", "tower-layer", "tower-service", @@ -1026,61 +966,22 @@ dependencies = [ [[package]] name = "bigdecimal" -version = "0.4.8" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a22f228ab7a1b23027ccc6c350b72868017af7ea8356fbdf19f8d991c690013" +checksum = "4d6867f1565b3aad85681f1015055b087fcfd840d6aeee6eee7f2da317603695" dependencies = [ "autocfg", "libm", "num-bigint", "num-integer", "num-traits", - "serde", -] - -[[package]] -name = "bindgen" -version = "0.72.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" -dependencies = [ - "bitflags 2.9.4", - "cexpr", - "clang-sys", - "itertools 0.13.0", - "log", - "prettyplease", - "proc-macro2", - "quote", - "regex", - "rustc-hash", - "shlex", - "syn 2.0.108", ] [[package]] name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - -[[package]] -name = "bitflags" -version = "2.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" - -[[package]] -name = "bitvec" -version = "1.0.1" +version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" -dependencies = [ - "funty", - "radium", - "tap", - "wyz", -] +checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" [[package]] name = "blake2" @@ -1088,20 +989,21 @@ version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" dependencies = [ - "digest", + "digest 0.10.7", ] [[package]] 
name = "blake3" -version = "1.8.2" +version = "1.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3888aaa89e4b2a40fca9848e400f6a658a5a3978de7be858e209cafa8be9a4a0" +checksum = "0aa83c34e62843d924f905e0f5c866eb1dd6545fc4d719e803d9ba6030371fce" dependencies = [ "arrayref", "arrayvec", "cc", "cfg-if", "constant_time_eq", + "cpufeatures 0.3.0", ] [[package]] @@ -1113,24 +1015,32 @@ dependencies = [ "generic-array", ] +[[package]] +name = "block-buffer" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be" +dependencies = [ + "hybrid-array", +] + [[package]] name = "bollard" -version = "0.19.3" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec7646ee90964aa59e9f832a67182791396a19a5b1d76eb17599a8310a7e2e09" +checksum = "ee04c4c84f1f811b017f2fbb7dd8815c976e7ca98593de9c1e2afad0f636bff4" dependencies = [ "async-stream", "base64 0.22.1", - "bitflags 2.9.4", + "bitflags", "bollard-buildkit-proto", "bollard-stubs", "bytes", - "chrono", "futures-core", "futures-util", "hex", "home", - "http 1.3.1", + "http 1.4.0", "http-body-util", "hyper", "hyper-named-pipe", @@ -1140,17 +1050,16 @@ dependencies = [ "log", "num", "pin-project-lite", - "rand 0.9.2", + "rand 0.9.4", "rustls", "rustls-native-certs", - "rustls-pemfile", "rustls-pki-types", "serde", "serde_derive", "serde_json", - "serde_repr", "serde_urlencoded", "thiserror", + "time", "tokio", "tokio-stream", "tokio-util", @@ -1175,67 +1084,18 @@ dependencies = [ [[package]] name = "bollard-stubs" -version = "1.49.1-rc.28.4.0" +version = "1.52.1-rc.29.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5731fe885755e92beff1950774068e0cae67ea6ec7587381536fca84f1779623" +checksum = "0f0a8ca8799131c1837d1282c3f81f31e76ceb0ce426e04a7fe1ccee3287c066" dependencies = [ "base64 0.22.1", "bollard-buildkit-proto", "bytes", - "chrono", "prost", "serde", "serde_json", "serde_repr", - "serde_with", -] - -[[package]] -name = "bon" -version = "3.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2529c31017402be841eb45892278a6c21a000c0a17643af326c73a73f83f0fb" -dependencies = [ - "bon-macros", - "rustversion", -] - -[[package]] -name = "bon-macros" -version = "3.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d82020dadcb845a345591863adb65d74fa8dc5c18a0b6d408470e13b7adc7005" -dependencies = [ - "darling", - "ident_case", - "prettyplease", - "proc-macro2", - "quote", - "rustversion", - "syn 2.0.108", -] - -[[package]] -name = "borsh" -version = "1.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad8646f98db542e39fc66e68a20b2144f6a732636df7c2354e74645faaa433ce" -dependencies = [ - "borsh-derive", - "cfg_aliases", -] - -[[package]] -name = "borsh-derive" -version = "1.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdd1d3c0c2f5833f22386f252fe8ed005c7f59fdcddeef025c01b4c3b9fd9ac3" -dependencies = [ - "once_cell", - "proc-macro-crate", - "proc-macro2", - "quote", - "syn 2.0.108", + "time", ] [[package]] @@ -1261,9 +1121,9 @@ dependencies = [ [[package]] name = "bstr" -version = "1.12.0" +version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "234113d19d0d7d613b40e86fb654acf958910802bcceab913a4f9e7cda03b1a4" +checksum = 
"63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" dependencies = [ "memchr", "serde", @@ -1271,31 +1131,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" - -[[package]] -name = "bytecheck" -version = "0.6.12" +version = "3.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23cdc57ce23ac53c931e88a43d06d070a6fd142f2617be5855eb75efc9beb1c2" -dependencies = [ - "bytecheck_derive", - "ptr_meta", - "simdutf8", -] - -[[package]] -name = "bytecheck_derive" -version = "0.6.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3db406d29fbcd95542e92559bed4d8ad92636d1ca8b3b72ede10b4bcc010e659" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" [[package]] name = "byteorder" @@ -1305,9 +1143,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.10.1" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" [[package]] name = "bytes-utils" @@ -1319,15 +1157,6 @@ dependencies = [ "either", ] -[[package]] -name = "bzip2" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" -dependencies = [ - "bzip2-sys", -] - [[package]] name = "bzip2" version = "0.6.1" @@ -1337,16 +1166,6 @@ dependencies = [ "libbz2-rs-sys", ] -[[package]] -name = "bzip2-sys" -version = "0.1.13+1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" -dependencies = [ - "cc", - "pkg-config", -] - [[package]] name = "cast" version = "0.3.0" @@ -1355,9 +1174,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.2.38" +version = "1.2.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80f41ae168f955c12fb8960b057d70d0ca153fb83182b57d86380443527be7e9" +checksum = "d16d90359e986641506914ba71350897565610e87ce0ad9e6f28569db3dd5c6d" dependencies = [ "find-msvc-tools", "jobserver", @@ -1365,20 +1184,11 @@ dependencies = [ "shlex", ] -[[package]] -name = "cexpr" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" -dependencies = [ - "nom", -] - [[package]] name = "cfg-if" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] name = "cfg_aliases" @@ -1387,17 +1197,28 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] -name = "chrono" -version = "0.4.42" +name = "chacha20" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2" 
+checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" dependencies = [ - "iana-time-zone", + "cfg-if", + "cpufeatures 0.3.0", + "rand_core 0.10.1", +] + +[[package]] +name = "chrono" +version = "0.4.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" +dependencies = [ + "iana-time-zone", "js-sys", "num-traits", "serde", "wasm-bindgen", - "windows-link 0.2.0", + "windows-link", ] [[package]] @@ -1437,33 +1258,11 @@ dependencies = [ "half", ] -[[package]] -name = "clang-sys" -version = "1.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" -dependencies = [ - "glob", - "libc", - "libloading 0.8.9", -] - -[[package]] -name = "clap" -version = "2.34.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" -dependencies = [ - "bitflags 1.3.2", - "textwrap", - "unicode-width 0.1.14", -] - [[package]] name = "clap" -version = "4.5.50" +version = "4.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c2cfd7bf8a6017ddaa4e32ffe7403d547790db06bd171c1c53926faab501623" +checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" dependencies = [ "clap_builder", "clap_derive", @@ -1471,9 +1270,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.50" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a4c05b9e80c5ccd3a7ef080ad7b6ba7d6fc00a985b8b157197075677c82c7a0" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" dependencies = [ "anstream", "anstyle", @@ -1483,21 +1282,21 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.49" +version = "4.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" +checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] name = "clap_lex" -version = "0.7.5" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" [[package]] name = "clipboard-win" @@ -1510,53 +1309,66 @@ dependencies = [ [[package]] name = "cmake" -version = "0.1.54" +version = "0.1.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0" +checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678" dependencies = [ "cc", ] +[[package]] +name = "cmov" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f88a43d011fc4a6876cb7344703e297c71dda42494fee094d5f7c76bf13f746" + [[package]] name = "colorchoice" -version = "1.0.4" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" [[package]] name = "comfy-table" -version = "7.1.2" +version = "7.2.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0d05af1e006a2407bedef5af410552494ce5be9090444dbbcb57258c1af3d56" +checksum = "958c5d6ecf1f214b4c2bbbbf6ab9523a864bd136dcf71a7e8904799acfe1ad47" dependencies = [ - "strum 0.26.3", - "strum_macros 0.26.4", - "unicode-width 0.2.1", + "unicode-segmentation", + "unicode-width 0.2.2", ] [[package]] -name = "console" -version = "0.15.11" +name = "compression-codecs" +version = "0.4.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8" +checksum = "ce2548391e9c1929c21bf6aa2680af86fe4c1b33e6cea9ac1cfeec0bd11218cf" dependencies = [ - "encode_unicode", - "libc", - "once_cell", - "windows-sys 0.59.0", + "bzip2", + "compression-core", + "flate2", + "liblzma", + "memchr", + "zstd", + "zstd-safe", ] +[[package]] +name = "compression-core" +version = "0.4.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc14f565cf027a105f7a44ccf9e5b424348421a1d8952a8fc9d499d313107789" + [[package]] name = "console" -version = "0.16.1" +version = "0.16.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b430743a6eb14e9764d4260d4c0d8123087d504eeb9c48f2b2a5e810dd369df4" +checksum = "d64e8af5551369d19cf50138de61f1c42074ab970f74e99be916646777f8fc87" dependencies = [ "encode_unicode", "libc", - "once_cell", - "unicode-width 0.2.1", - "windows-sys 0.61.0", + "unicode-width 0.2.2", + "windows-sys 0.61.2", ] [[package]] @@ -1569,6 +1381,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "const-oid" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c" + [[package]] name = "const-random" version = "0.1.18" @@ -1584,25 +1402,16 @@ version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" dependencies = [ - "getrandom 0.2.16", + "getrandom 0.2.17", "once_cell", "tiny-keccak", ] -[[package]] -name = "const_panic" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e262cdaac42494e3ae34c43969f9cdeb7da178bdb4b66fa6a1ea2edb4c8ae652" -dependencies = [ - "typewit", -] - [[package]] name = "constant_time_eq" -version = "0.3.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" +checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" [[package]] name = "core-foundation" @@ -1621,29 +1430,38 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] -name = "core_extensions" -version = "1.5.4" +name = "cpufeatures" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42bb5e5d0269fd4f739ea6cedaf29c16d81c27a7ce7582008e90eb50dcd57003" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" dependencies = [ - "core_extensions_proc_macros", + "libc", ] [[package]] -name = "core_extensions_proc_macros" -version = "1.5.4" +name = "cpufeatures" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "533d38ecd2709b7608fb8e18e4504deb99e9a72879e6aa66373a76d8dc4259ea" +checksum = 
"8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] [[package]] -name = "cpufeatures" -version = "0.2.17" +name = "crc" +version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +checksum = "5eb8a2a1cd12ab0d987a5d5e825195d372001a4094a0376319d5a0ad71c1ba0d" dependencies = [ - "libc", + "crc-catalog", ] +[[package]] +name = "crc-catalog" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "217698eaf96b4a3f0bc4f3662aaa55bdf913cd54d7204591faa790070c6d0853" + [[package]] name = "crc32fast" version = "1.5.0" @@ -1655,19 +1473,21 @@ dependencies = [ [[package]] name = "criterion" -version = "0.7.0" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1c047a62b0cc3e145fa84415a3191f628e980b194c2755aa12300a4e6cbd928" +checksum = "950046b2aa2492f9a536f5f4f9a3de7b9e2476e575e05bd6c333371add4d98f3" dependencies = [ + "alloca", "anes", "cast", "ciborium", - "clap 4.5.50", + "clap", "criterion-plot", "futures", "itertools 0.13.0", "num-traits", "oorandom", + "page_size", "plotters", "rayon", "regex", @@ -1680,23 +1500,14 @@ dependencies = [ [[package]] name = "criterion-plot" -version = "0.6.0" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b1bcc0dc7dfae599d84ad0b1a55f80cde8af3725da8313b528da95ef783e338" +checksum = "d8d80a2f4f5b554395e47b5d8305bc3d27813bacb73493eb1001e8f76dae29ea" dependencies = [ "cast", "itertools 0.13.0", ] -[[package]] -name = "crossbeam-channel" -version = "0.5.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" -dependencies = [ - "crossbeam-utils", -] - [[package]] name = "crossbeam-deque" version = "0.8.6" @@ -1730,50 +1541,69 @@ checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" [[package]] name = "crypto-common" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" dependencies = [ "generic-array", "typenum", ] +[[package]] +name = "crypto-common" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77727bb15fa921304124b128af125e7e3b968275d1b108b379190264f4423710" +dependencies = [ + "hybrid-array", +] + [[package]] name = "csv" -version = "1.3.1" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" dependencies = [ "csv-core", "itoa", "ryu", - "serde", + "serde_core", ] [[package]] name = "csv-core" -version = "0.1.12" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" dependencies = [ "memchr", ] [[package]] name = "ctor" -version = "0.6.1" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ffc71fcdcdb40d6f087edddf7f8f1f8f79e6cf922f555a9ee8779752d4819bd" +checksum = 
"83cf0d42651b16c6dfe68685716d18480d18a9c39c62d76e8cf3eb6ed5d8bcbf" dependencies = [ "ctor-proc-macro", "dtor", + "link-section", ] [[package]] name = "ctor-proc-macro" -version = "0.0.7" +version = "0.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a949c44fcacbbbb7ada007dc7acb34603dd97cd47de5d054f2b6493ecebb483" + +[[package]] +name = "ctutils" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52560adf09603e58c9a7ee1fe1dcb95a16927b17c127f0ac02d6e768a0e25bc1" +checksum = "7d5515a3834141de9eafb9717ad39eea8247b5674e6066c404e8c4b365d2a29e" +dependencies = [ + "cmov", +] [[package]] name = "cty" @@ -1783,9 +1613,9 @@ checksum = "b365fabc795046672053e29c954733ec3b05e4be654ab130fe8f1f94d7051f35" [[package]] name = "darling" -version = "0.21.3" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" +checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" dependencies = [ "darling_core", "darling_macro", @@ -1793,27 +1623,26 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.21.3" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4" +checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0" dependencies = [ - "fnv", "ident_case", "proc-macro2", "quote", "strsim", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] name = "darling_macro" -version = "0.21.3" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" +checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ "darling_core", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] @@ -1832,13 +1661,13 @@ dependencies = [ [[package]] name = "datafusion" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", "arrow-schema", "async-trait", "bytes", - "bzip2 0.6.1", + "bzip2", "chrono", "criterion", "ctor", @@ -1877,16 +1706,18 @@ dependencies = [ "flate2", "futures", "glob", + "indexmap 2.14.0", "insta", "itertools 0.14.0", + "liblzma", "log", "nix", "object_store", "parking_lot", "parquet", - "paste", - "rand 0.9.2", + "rand 0.9.4", "rand_distr", + "recursive", "regex", "rstest", "serde", @@ -1898,15 +1729,18 @@ dependencies = [ "tokio", "url", "uuid", - "xz2", "zstd", ] [[package]] name = "datafusion-benchmarks" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", + "async-trait", + "bytes", + "clap", + "criterion", "datafusion", "datafusion-common", "datafusion-proto", @@ -1917,19 +1751,19 @@ dependencies = [ "mimalloc", "object_store", "parquet", - "rand 0.9.2", + "rand 0.9.4", "regex", "serde", "serde_json", "snmalloc-rs", - "structopt", + "tempfile", "tokio", "tokio-util", ] [[package]] name = "datafusion-catalog" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", "async-trait", @@ -1952,10 +1786,11 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", "async-trait", + "chrono", "datafusion-catalog", "datafusion-common", "datafusion-datasource", @@ -1970,19 +1805,18 @@ dependencies = [ "itertools 0.14.0", "log", "object_store", - "tokio", ] [[package]] name = "datafusion-cli" -version = "50.3.0" +version = "53.1.0" 
dependencies = [ "arrow", "async-trait", "aws-config", "aws-credential-types", "chrono", - "clap 4.5.50", + "clap", "ctor", "datafusion", "datafusion-common", @@ -1999,7 +1833,7 @@ dependencies = [ "regex", "rstest", "rustyline", - "testcontainers", + "serde_json", "testcontainers-modules", "tokio", "url", @@ -2007,34 +1841,35 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "50.3.0" +version = "53.1.0" dependencies = [ - "ahash 0.8.12", - "apache-avro", "arrow", "arrow-ipc", + "arrow-schema", "chrono", + "criterion", + "foldhash 0.2.0", "half", - "hashbrown 0.14.5", + "hashbrown 0.17.0", "hex", - "indexmap 2.12.0", + "indexmap 2.14.0", "insta", + "itertools 0.14.0", "libc", "log", "object_store", "parquet", - "paste", - "pyo3", - "rand 0.9.2", + "rand 0.9.4", "recursive", "sqlparser", "tokio", + "uuid", "web-time", ] [[package]] name = "datafusion-common-runtime" -version = "50.3.0" +version = "53.1.0" dependencies = [ "futures", "log", @@ -2043,13 +1878,13 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", "async-compression", "async-trait", "bytes", - "bzip2 0.6.1", + "bzip2", "chrono", "criterion", "datafusion-common", @@ -2064,21 +1899,23 @@ dependencies = [ "flate2", "futures", "glob", + "insta", "itertools 0.14.0", + "liblzma", "log", "object_store", - "rand 0.9.2", + "parking_lot", + "rand 0.9.4", "tempfile", "tokio", "tokio-util", "url", - "xz2", "zstd", ] [[package]] name = "datafusion-datasource-arrow" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", "arrow-ipc", @@ -2101,26 +1938,24 @@ dependencies = [ [[package]] name = "datafusion-datasource-avro" -version = "50.3.0" +version = "53.1.0" dependencies = [ - "apache-avro", "arrow", + "arrow-avro", "async-trait", "bytes", "datafusion-common", "datafusion-datasource", - "datafusion-physical-expr-common", + "datafusion-physical-expr-adapter", "datafusion-physical-plan", "datafusion-session", "futures", - "num-traits", "object_store", - "serde_json", ] [[package]] name = "datafusion-datasource-csv" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", "async-trait", @@ -2141,7 +1976,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", "async-trait", @@ -2156,23 +1991,28 @@ dependencies = [ "datafusion-session", "futures", "object_store", + "serde_json", "tokio", + "tokio-stream", ] [[package]] name = "datafusion-datasource-parquet" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", "async-trait", "bytes", "chrono", + "criterion", "datafusion-common", "datafusion-common-runtime", "datafusion-datasource", "datafusion-execution", "datafusion-expr", + "datafusion-functions", "datafusion-functions-aggregate-common", + "datafusion-functions-nested", "datafusion-physical-expr", "datafusion-physical-expr-adapter", "datafusion-physical-expr-common", @@ -2185,16 +2025,17 @@ dependencies = [ "object_store", "parking_lot", "parquet", + "tempfile", "tokio", ] [[package]] name = "datafusion-doc" -version = "50.3.0" +version = "53.1.0" [[package]] name = "datafusion-examples" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", "arrow-flight", @@ -2204,18 +2045,25 @@ dependencies = [ "bytes", "dashmap", "datafusion", - "datafusion-ffi", + "datafusion-common", + "datafusion-expr", "datafusion-physical-expr-adapter", "datafusion-proto", + "datafusion-sql", "env_logger", "futures", + "insta", "log", "mimalloc", 
"nix", + "nom", "object_store", "prost", - "rand 0.9.2", + "rand 0.9.4", + "serde", "serde_json", + "strum", + "strum_macros", "tempfile", "test-utils", "tokio", @@ -2228,30 +2076,33 @@ dependencies = [ [[package]] name = "datafusion-execution" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", + "arrow-buffer", "async-trait", "chrono", "dashmap", "datafusion-common", "datafusion-expr", + "datafusion-physical-expr-common", "futures", "insta", "log", "object_store", "parking_lot", "parquet", - "rand 0.9.2", + "rand 0.9.4", "tempfile", "url", ] [[package]] name = "datafusion-expr" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", + "arrow-schema", "async-trait", "chrono", "ctor", @@ -2262,10 +2113,9 @@ dependencies = [ "datafusion-functions-window-common", "datafusion-physical-expr-common", "env_logger", - "indexmap 2.12.0", + "indexmap 2.14.0", "insta", "itertools 0.14.0", - "paste", "recursive", "serde_json", "sqlparser", @@ -2273,40 +2123,54 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", "datafusion-common", - "indexmap 2.12.0", + "indexmap 2.14.0", + "insta", "itertools 0.14.0", - "paste", ] [[package]] name = "datafusion-ffi" -version = "50.3.0" +version = "53.1.0" dependencies = [ - "abi_stable", "arrow", "arrow-schema", "async-ffi", "async-trait", "datafusion", + "datafusion-catalog", "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions", + "datafusion-functions-aggregate", "datafusion-functions-aggregate-common", + "datafusion-functions-table", + "datafusion-functions-window", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-optimizer", + "datafusion-physical-plan", "datafusion-proto", "datafusion-proto-common", + "datafusion-session", "doc-comment", "futures", + "libloading", "log", "prost", "semver", + "stabby", "tokio", ] [[package]] name = "datafusion-functions" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", "arrow-buffer", @@ -2314,6 +2178,7 @@ dependencies = [ "blake2", "blake3", "chrono", + "chrono-tz", "criterion", "ctor", "datafusion-common", @@ -2322,25 +2187,25 @@ dependencies = [ "datafusion-expr", "datafusion-expr-common", "datafusion-macros", + "datafusion-physical-expr-common", "env_logger", "hex", "itertools 0.14.0", "log", - "md-5", + "md-5 0.11.0", + "memchr", "num-traits", - "rand 0.9.2", + "rand 0.9.4", "regex", "sha2", "tokio", - "unicode-segmentation", "uuid", ] [[package]] name = "datafusion-functions-aggregate" -version = "50.3.0" +version = "53.1.0" dependencies = [ - "ahash 0.8.12", "arrow", "criterion", "datafusion-common", @@ -2351,28 +2216,28 @@ dependencies = [ "datafusion-macros", "datafusion-physical-expr", "datafusion-physical-expr-common", + "foldhash 0.2.0", "half", "log", - "paste", - "rand 0.9.2", + "num-traits", + "rand 0.9.4", ] [[package]] name = "datafusion-functions-aggregate-common" -version = "50.3.0" +version = "53.1.0" dependencies = [ - "ahash 0.8.12", "arrow", "criterion", "datafusion-common", "datafusion-expr-common", "datafusion-physical-expr-common", - "rand 0.9.2", + "rand 0.9.4", ] [[package]] name = "datafusion-functions-nested" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", "arrow-ord", @@ -2386,16 +2251,19 @@ dependencies = [ "datafusion-functions-aggregate", "datafusion-functions-aggregate-common", "datafusion-macros", + "datafusion-physical-expr", 
"datafusion-physical-expr-common", + "hashbrown 0.17.0", "itertools 0.14.0", + "itoa", "log", - "paste", - "rand 0.9.2", + "memchr", + "rand 0.9.4", ] [[package]] name = "datafusion-functions-table" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", "async-trait", @@ -2404,14 +2272,14 @@ dependencies = [ "datafusion-expr", "datafusion-physical-plan", "parking_lot", - "paste", ] [[package]] name = "datafusion-functions-window" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", + "criterion", "datafusion-common", "datafusion-doc", "datafusion-expr", @@ -2420,12 +2288,11 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "log", - "paste", ] [[package]] name = "datafusion-functions-window-common" -version = "50.3.0" +version = "53.1.0" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -2433,16 +2300,16 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "50.3.0" +version = "53.1.0" dependencies = [ "datafusion-doc", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] name = "datafusion-optimizer" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", "async-trait", @@ -2458,7 +2325,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-sql", "env_logger", - "indexmap 2.12.0", + "indexmap 2.14.0", "insta", "itertools 0.14.0", "log", @@ -2469,9 +2336,8 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "50.3.0" +version = "53.1.0" dependencies = [ - "ahash 0.8.12", "arrow", "criterion", "datafusion-common", @@ -2481,20 +2347,21 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-physical-expr-common", "half", - "hashbrown 0.14.5", - "indexmap 2.12.0", + "hashbrown 0.17.0", + "indexmap 2.14.0", "insta", "itertools 0.14.0", "parking_lot", - "paste", - "petgraph 0.8.3", - "rand 0.9.2", + "petgraph", + "rand 0.9.4", + "recursive", "rstest", + "tokio", ] [[package]] name = "datafusion-physical-expr-adapter" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", "datafusion-common", @@ -2507,19 +2374,24 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "50.3.0" +version = "53.1.0" dependencies = [ - "ahash 0.8.12", "arrow", + "chrono", + "criterion", "datafusion-common", "datafusion-expr-common", - "hashbrown 0.14.5", + "hashbrown 0.17.0", + "indexmap 2.14.0", "itertools 0.14.0", + "parking_lot", + "pin-project", + "rand 0.9.4", ] [[package]] name = "datafusion-physical-optimizer" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", "datafusion-common", @@ -2527,6 +2399,7 @@ dependencies = [ "datafusion-expr", "datafusion-expr-common", "datafusion-functions", + "datafusion-functions-window", "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", @@ -2539,19 +2412,19 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "50.3.0" +version = "53.1.0" dependencies = [ - "ahash 0.8.12", "arrow", + "arrow-data", "arrow-ord", "arrow-schema", "async-trait", - "chrono", "criterion", "datafusion-common", "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", + "datafusion-functions", "datafusion-functions-aggregate", "datafusion-functions-aggregate-common", "datafusion-functions-window", @@ -2560,14 +2433,15 @@ dependencies = [ "datafusion-physical-expr-common", "futures", "half", - "hashbrown 0.14.5", - "indexmap 2.12.0", + "hashbrown 0.17.0", + "indexmap 2.14.0", "insta", "itertools 0.14.0", "log", + 
"num-traits", "parking_lot", "pin-project-lite", - "rand 0.9.2", + "rand 0.9.4", "rstest", "rstest_reuse", "tokio", @@ -2575,9 +2449,10 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", + "async-trait", "chrono", "datafusion", "datafusion-catalog", @@ -2601,7 +2476,7 @@ dependencies = [ "datafusion-proto-common", "doc-comment", "object_store", - "pbjson", + "pbjson 0.9.0", "pretty_assertions", "prost", "serde", @@ -2611,19 +2486,19 @@ dependencies = [ [[package]] name = "datafusion-proto-common" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", "datafusion-common", "doc-comment", - "pbjson", + "pbjson 0.9.0", "prost", "serde", ] [[package]] name = "datafusion-pruning" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", "datafusion-common", @@ -2641,7 +2516,7 @@ dependencies = [ [[package]] name = "datafusion-session" -version = "50.3.0" +version = "53.1.0" dependencies = [ "async-trait", "datafusion-common", @@ -2653,27 +2528,35 @@ dependencies = [ [[package]] name = "datafusion-spark" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", "bigdecimal", "chrono", "crc32fast", "criterion", + "datafusion", "datafusion-catalog", "datafusion-common", "datafusion-execution", "datafusion-expr", "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-functions-aggregate-common", + "datafusion-functions-nested", "log", - "rand 0.9.2", - "sha1", + "num-traits", + "percent-encoding", + "rand 0.9.4", + "serde_json", + "sha1 0.11.0", + "sha2", "url", ] [[package]] name = "datafusion-sql" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", "bigdecimal", @@ -2686,11 +2569,10 @@ dependencies = [ "datafusion-functions-nested", "datafusion-functions-window", "env_logger", - "indexmap 2.12.0", + "indexmap 2.14.0", "insta", "itertools 0.14.0", "log", - "paste", "recursive", "regex", "rstest", @@ -2699,14 +2581,14 @@ dependencies = [ [[package]] name = "datafusion-sqllogictest" -version = "50.3.0" +version = "53.1.0" dependencies = [ "arrow", "async-trait", "bigdecimal", "bytes", "chrono", - "clap 4.5.50", + "clap", "datafusion", "datafusion-spark", "datafusion-substrait", @@ -2717,14 +2599,12 @@ dependencies = [ "itertools 0.14.0", "log", "object_store", - "postgres-protocol", "postgres-types", "regex", - "rust_decimal", + "serde_json", "sqllogictest", "sqlparser", "tempfile", - "testcontainers", "testcontainers-modules", "thiserror", "tokio", @@ -2733,7 +2613,7 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "50.3.0" +version = "53.1.0" dependencies = [ "async-recursion", "async-trait", @@ -2750,13 +2630,13 @@ dependencies = [ "substrait", "tokio", "url", - "uuid", ] [[package]] name = "datafusion-wasmtest" -version = "50.3.0" +version = "53.1.0" dependencies = [ + "bytes", "chrono", "console_error_panic_hook", "datafusion", @@ -2766,6 +2646,7 @@ dependencies = [ "datafusion-optimizer", "datafusion-physical-plan", "datafusion-sql", + "futures", "getrandom 0.3.4", "object_store", "tokio", @@ -2776,12 +2657,12 @@ dependencies = [ [[package]] name = "deranged" -version = "0.5.3" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d630bccd429a5bb5a64b5e94f693bfc48c9f8566418fda4c494cc94f911f87cc" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" dependencies = [ "powerfmt", - "serde", + "serde_core", ] [[package]] @@ -2796,11 +2677,23 @@ version = "0.10.7" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ - "block-buffer", - "crypto-common", + "block-buffer 0.10.4", + "crypto-common 0.1.7", "subtle", ] +[[package]] +name = "digest" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4850db49bf08e663084f7fb5c87d202ef91a3907271aff24a94eb97ff039153c" +dependencies = [ + "block-buffer 0.12.0", + "const-oid", + "crypto-common 0.2.1", + "ctutils", +] + [[package]] name = "dirs" version = "6.0.0" @@ -2819,7 +2712,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.61.0", + "windows-sys 0.60.2", ] [[package]] @@ -2830,14 +2723,14 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] name = "doc-comment" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" +checksum = "780955b8b195a21ab8e4ac6b60dd1dbdcec1dc6c51c0617964b08c81785e12c9" [[package]] name = "docker_credential" @@ -2852,18 +2745,18 @@ dependencies = [ [[package]] name = "dtor" -version = "0.1.1" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "404d02eeb088a82cfd873006cb713fe411306c7d182c344905e101fb1167d301" +checksum = "edf234dd1594d6dd434a8fb8cada51ddbbc593e40e4a01556a0b31c62da2775b" dependencies = [ "dtor-proc-macro", ] [[package]] name = "dtor-proc-macro" -version = "0.0.6" +version = "0.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f678cf4a922c215c63e0de95eb1ff08a958a81d47e485cf9da1e27bf6305cfa5" +checksum = "2647271c92754afcb174e758003cfd1cbf1e43e5a7853d7b1813e63e19e39a73" [[package]] name = "dunce" @@ -2886,7 +2779,7 @@ dependencies = [ "enum-ordinalize", "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] @@ -2903,35 +2796,35 @@ checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" [[package]] name = "endian-type" -version = "0.1.2" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" +checksum = "869b0adbda23651a9c5c0c3d270aac9fcb52e8622a8f2b17e57802d7791962f2" [[package]] name = "enum-ordinalize" -version = "4.3.0" +version = "4.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fea0dcfa4e54eeb516fe454635a95753ddd39acda650ce703031c6973e315dd5" +checksum = "4a1091a7bb1f8f2c4b28f1fe2cef4980ca2d410a3d727d67ecc3178c9b0800f0" dependencies = [ "enum-ordinalize-derive", ] [[package]] name = "enum-ordinalize-derive" -version = "4.3.1" +version = "4.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d28318a75d4aead5c4db25382e8ef717932d0346600cacae6357eb5941bc5ff" +checksum = "8ca9601fb2d62598ee17836250842873a413586e5d7ed88b356e38ddbb0ec631" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] name = "env_filter" -version = "0.1.3" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" +checksum = "32e90c2accc4b07a8456ea0debdc2e7587bdd890680d71173a15d4ae604f6eef" dependencies = [ "log", "regex", @@ -2939,9 +2832,9 @@ dependencies = [ 
[[package]] name = "env_logger" -version = "0.11.8" +version = "0.11.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f" +checksum = "0621c04f2196ac3f488dd583365b9c09be011a4ab8b9f37248ffcc8f6198b56a" dependencies = [ "anstream", "anstyle", @@ -2963,7 +2856,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.0", + "windows-sys 0.52.0", ] [[package]] @@ -2980,13 +2873,12 @@ checksum = "5692dd7b5a1978a5aeb0ce83b7655c58ca8efdcb79d21036ea249da95afec2c6" [[package]] name = "etcetera" -version = "0.10.0" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26c7b13d0780cb82722fd59f6f57f925e143427e4a75313a6c77243bf5326ae6" +checksum = "de48cc4d1c1d97a20fd819def54b890cadde72ed3ad0c614822a0a433361be96" dependencies = [ "cfg-if", - "home", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -2997,26 +2889,25 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" [[package]] name = "fastrand" -version = "2.3.0" +version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" [[package]] -name = "fd-lock" -version = "4.0.4" +name = "ferroid" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce92ff622d6dadf7349484f42c93271a0d49b7cc4d466a936405bacbe10aa78" +checksum = "ee93edf3c501f0035bbeffeccfed0b79e14c311f12195ec0e661e114a0f60da4" dependencies = [ - "cfg-if", - "rustix", - "windows-sys 0.59.0", + "portable-atomic", + "rand 0.10.1", + "web-time", ] [[package]] name = "ffi_example_table_provider" version = "0.1.0" dependencies = [ - "abi_stable", "arrow", "datafusion", "datafusion-ffi", @@ -3027,7 +2918,6 @@ dependencies = [ name = "ffi_module_interface" version = "0.1.0" dependencies = [ - "abi_stable", "datafusion-ffi", ] @@ -3035,30 +2925,29 @@ dependencies = [ name = "ffi_module_loader" version = "0.1.0" dependencies = [ - "abi_stable", "datafusion", "datafusion-ffi", "ffi_module_interface", + "libloading", "tokio", ] [[package]] name = "filetime" -version = "0.2.26" +version = "0.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc0505cd1b6fa6580283f6bdf70a73fcf4aba1184038c90902b92b3dd0df63ed" +checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db" dependencies = [ "cfg-if", "libc", "libredox", - "windows-sys 0.60.2", ] [[package]] name = "find-msvc-tools" -version = "0.1.2" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ced73b1dacfc750a6db6c0a0c3a3853c8b41997e2e2c563dc90804ae6867959" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" [[package]] name = "fixedbitset" @@ -3068,23 +2957,23 @@ checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" [[package]] name = "flatbuffers" -version = "25.2.10" +version = "25.12.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1045398c1bfd89168b5fd3f1fc11f6e70b34f6f66300c87d44d3de849463abf1" +checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" dependencies = [ - "bitflags 2.9.4", + "bitflags", "rustc_version", ] 
[[package]] name = "flate2" -version = "1.1.4" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc5a4e564e38c699f2880d3fda590bedc2e69f3f84cd48b457bd892ce61d0aa9" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" dependencies = [ "crc32fast", - "libz-rs-sys", "miniz_oxide", + "zlib-rs", ] [[package]] @@ -3099,6 +2988,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -3110,9 +3005,9 @@ dependencies = [ [[package]] name = "fs-err" -version = "3.1.2" +version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44f150ffc8782f35521cec2b23727707cb4045706ba3c854e86bef66b3a8cdbd" +checksum = "73fde052dbfc920003cfd2c8e2c6e6d4cc7c1091538c3a24226cec0665ab08c0" dependencies = [ "autocfg", ] @@ -3123,17 +3018,11 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" -[[package]] -name = "funty" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" - [[package]] name = "futures" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" dependencies = [ "futures-channel", "futures-core", @@ -3146,9 +3035,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" dependencies = [ "futures-core", "futures-sink", @@ -3156,15 +3045,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" [[package]] name = "futures-executor" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" dependencies = [ "futures-core", "futures-task", @@ -3173,32 +3062,32 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" +checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" [[package]] name = "futures-macro" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +checksum = 
"e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] name = "futures-sink" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" [[package]] name = "futures-task" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" [[package]] name = "futures-timer" @@ -3208,9 +3097,9 @@ checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24" [[package]] name = "futures-util" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" dependencies = [ "futures-channel", "futures-core", @@ -3220,7 +3109,6 @@ dependencies = [ "futures-task", "memchr", "pin-project-lite", - "pin-utils", "slab", ] @@ -3228,7 +3116,7 @@ dependencies = [ name = "gen" version = "0.1.0" dependencies = [ - "pbjson-build", + "pbjson-build 0.9.0", "prost-build", ] @@ -3236,24 +3124,15 @@ dependencies = [ name = "gen-common" version = "0.1.0" dependencies = [ - "pbjson-build", + "pbjson-build 0.9.0", "prost-build", ] [[package]] -name = "generational-arena" -version = "0.2.9" +name = "generic-array" +version = "0.14.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877e94aff08e743b651baaea359664321055749b398adff8740a7399af7796e7" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "generic-array" -version = "0.14.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" dependencies = [ "typenum", "version_check", @@ -3261,14 +3140,14 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" dependencies = [ "cfg-if", "js-sys", "libc", - "wasi", + "wasi 0.11.1+wasi-snapshot-preview1", "wasm-bindgen", ] @@ -3281,11 +3160,25 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "r-efi", + "r-efi 5.3.0", "wasip2", "wasm-bindgen", ] +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", + "rand_core 0.10.1", + "wasip2", + "wasip3", +] + [[package]] name = "glob" version = "0.3.3" @@ -3294,9 +3187,9 @@ checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" [[package]] name = "globset" -version = "0.4.16" +version = "0.4.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54a1028dfc5f5df5da8a56a73e6c153c9a9708ec57232470703592a3f18e49f5" +checksum = 
"52dfc19153a48bde0cbd630453615c8151bce3a5adfac7a0aebfbf0a1e1f57e3" dependencies = [ "aho-corasick", "bstr", @@ -3307,17 +3200,17 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.12" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3c0b69cfcb4e1b9f1bf2f53f95f766e4661169728ec61cd3fe5a0166f2d1386" +checksum = "2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54" dependencies = [ "atomic-waker", "bytes", "fnv", "futures-core", "futures-sink", - "http 1.3.1", - "indexmap 2.12.0", + "http 1.4.0", + "indexmap 2.14.0", "slab", "tokio", "tokio-util", @@ -3333,6 +3226,8 @@ dependencies = [ "cfg-if", "crunchy", "num-traits", + "rand 0.9.4", + "rand_distr", "zerocopy", ] @@ -3341,19 +3236,12 @@ name = "hashbrown" version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" -dependencies = [ - "ahash 0.7.8", -] [[package]] name = "hashbrown" version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" -dependencies = [ - "ahash 0.8.12", - "allocator-api2", -] [[package]] name = "hashbrown" @@ -3361,24 +3249,29 @@ version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ - "allocator-api2", - "equivalent", - "foldhash", + "foldhash 0.1.5", ] [[package]] name = "hashbrown" -version = "0.16.0" +version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", +] [[package]] -name = "heck" -version = "0.3.3" +name = "hashbrown" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" +checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" dependencies = [ - "unicode-segmentation", + "allocator-api2", + "equivalent", + "foldhash 0.2.0", ] [[package]] @@ -3395,20 +3288,20 @@ checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] name = "hmac" -version = "0.12.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +checksum = "6303bc9732ae41b04cb554b844a762b4115a61bfaa81e3e83050991eeb56863f" dependencies = [ - "digest", + "digest 0.11.2", ] [[package]] name = "home" -version = "0.5.11" +version = "0.5.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf" +checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -3424,12 +3317,11 @@ dependencies = [ [[package]] name = "http" -version = "1.3.1" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565" +checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" dependencies = [ "bytes", - "fnv", "itoa", ] @@ -3451,7 +3343,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", - "http 1.3.1", + "http 1.4.0", ] [[package]] @@ -3462,7 +3354,7 @@ checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" dependencies = [ "bytes", "futures-core", - "http 1.3.1", + "http 1.4.0", "http-body 1.0.1", "pin-project-lite", ] @@ -3485,24 +3377,32 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" +[[package]] +name = "hybrid-array" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d46837a0ed51fe95bd3b05de33cd64a1ee88fc797477ca48446872504507c5" +dependencies = [ + "typenum", +] + [[package]] name = "hyper" -version = "1.7.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb3aa54a13a0dfe7fbe3a59e0c76093041720fdc77b110cc0fc260fafb4dc51e" +checksum = "6299f016b246a94207e63da54dbe807655bf9e00044f73ded42c3ac5305fbcca" dependencies = [ "atomic-waker", "bytes", "futures-channel", "futures-core", "h2", - "http 1.3.1", + "http 1.4.0", "http-body 1.0.1", "httparse", "httpdate", "itoa", "pin-project-lite", - "pin-utils", "smallvec", "tokio", "want", @@ -3525,16 +3425,15 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.27.7" +version = "0.27.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" +checksum = "33ca68d021ef39cf6463ab54c1d0f5daf03377b70561305bb89a8f83aab66e0f" dependencies = [ - "http 1.3.1", + "http 1.4.0", "hyper", "hyper-util", "rustls", "rustls-native-certs", - "rustls-pki-types", "tokio", "tokio-rustls", "tower-service", @@ -3555,16 +3454,15 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.17" +version = "0.1.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c6995591a8f1380fcb4ba966a252a4b29188d51d2b89e3a252f5305be65aea8" +checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" dependencies = [ "base64 0.22.1", "bytes", "futures-channel", - "futures-core", "futures-util", - "http 1.3.1", + "http 1.4.0", "http-body 1.0.1", "hyper", "ipnet", @@ -3594,9 +3492,9 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.64" +version = "0.1.65" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33e57f83510bb73707521ebaffa789ec8caf86f9657cad665b092b581d40e9fb" +checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" dependencies = [ "android_system_properties", "core-foundation-sys", @@ -3604,7 +3502,7 @@ dependencies = [ "js-sys", "log", "wasm-bindgen", - "windows-core 0.62.0", + "windows-core", ] [[package]] @@ -3618,12 +3516,13 @@ dependencies = [ [[package]] name = "icu_collections" -version = "2.0.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" +checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c" dependencies = [ "displaydoc", "potential_utf", + "utf8_iter", "yoke", "zerofrom", "zerovec", @@ -3631,9 +3530,9 @@ dependencies = [ [[package]] name = "icu_locale_core" -version = "2.0.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" +checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29" dependencies = [ "displaydoc", "litemap", @@ -3644,11 +3543,10 @@ dependencies = [ [[package]] name = "icu_normalizer" -version = "2.0.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" +checksum = "c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4" dependencies = [ - "displaydoc", "icu_collections", "icu_normalizer_data", "icu_properties", @@ -3659,42 +3557,38 @@ dependencies = [ [[package]] name = "icu_normalizer_data" -version = "2.0.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" +checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38" [[package]] name = "icu_properties" -version = "2.0.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b" +checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de" dependencies = [ - "displaydoc", "icu_collections", "icu_locale_core", "icu_properties_data", "icu_provider", - "potential_utf", "zerotrie", "zerovec", ] [[package]] name = "icu_properties_data" -version = "2.0.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632" +checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14" [[package]] name = "icu_provider" -version = "2.0.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" +checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421" dependencies = [ "displaydoc", "icu_locale_core", - "stable_deref_trait", - "tinystr", "writeable", "yoke", "zerofrom", @@ -3702,6 +3596,12 @@ dependencies = [ "zerovec", ] +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + [[package]] name = "ident_case" version = "1.0.1" @@ -3742,47 +3642,42 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.12.0" +version = "2.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6717a8d2a5a929a1a2eb43a12812498ed141a0bcfb7e8f7844fbdbe4303bba9f" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", - "hashbrown 0.16.0", + "hashbrown 0.17.0", "serde", "serde_core", ] [[package]] name = "indicatif" -version = "0.18.0" +version = "0.18.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70a646d946d06bedbbc4cac4c218acf4bbf2d87757a784857025f4d447e4e1cd" +checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb" dependencies = [ - "console 0.16.1", + "console", "portable-atomic", - "unicode-width 0.2.1", + "unicode-width 0.2.2", "unit-prefix", "web-time", ] -[[package]] -name = "indoc" -version = "2.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd" - [[package]] name = "insta" -version = "1.43.2" 
+version = "1.47.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46fdb647ebde000f43b5b53f773c30cf9b0cb4300453208713fa38b2c70935a0" +checksum = "7b4a6248eb93a4401ed2f37dfe8ea592d3cf05b7cf4f8efa867b6895af7e094e" dependencies = [ - "console 0.15.11", + "console", "globset", "once_cell", "regex", "serde", "similar", + "tempfile", "walkdir", ] @@ -3805,15 +3700,15 @@ checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] name = "ipnet" -version = "2.11.0" +version = "2.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" +checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" [[package]] name = "iri-string" -version = "0.7.8" +version = "0.7.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbc5ebe9c3a1a7a5127f920a418f7585e9e758e911d0466ed004f393b0e380b2" +checksum = "25e659a4bb38e810ebc252e53b5814ff908a8c58c2a9ce2fae1bbec24cbf4e20" dependencies = [ "memchr", "serde", @@ -3821,9 +3716,9 @@ dependencies = [ [[package]] name = "is_terminal_polyfill" -version = "1.70.1" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" [[package]] name = "itertools" @@ -3845,32 +3740,32 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.15" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" [[package]] name = "jiff" -version = "0.2.15" +version = "0.2.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be1f93b8b1eb69c77f24bbb0afdf66f54b632ee39af40ca21c4365a1d7347e49" +checksum = "f00b5dbd620d61dfdcb6007c9c1f6054ebd75319f163d886a9055cec1155073d" dependencies = [ "jiff-static", "log", "portable-atomic", "portable-atomic-util", - "serde", + "serde_core", ] [[package]] name = "jiff-static" -version = "0.2.15" +version = "0.2.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03343451ff899767262ec32146f6d559dd759fdadf42ff0e227c7c48f72594b4" +checksum = "e000de030ff8022ea1da3f466fbb0f3a809f5e51ed31f6dd931c35181ad8e6d7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] @@ -3885,10 +3780,12 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.82" +version = "0.3.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b011eec8cc36da2aab2d5cff675ec18454fad408585853910a202391cf9f8e65" +checksum = "2964e92d1d9dc3364cae4d718d93f227e3abb088e747d92e0395bfdedf1c12ca" dependencies = [ + "cfg-if", + "futures-util", "once_cell", "wasm-bindgen", ] @@ -3899,6 +3796,12 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + [[package]] name = "lexical-core" version = "1.0.6" @@ -3958,112 +3861,118 @@ dependencies = [ [[package]] name = "libbz2-rs-sys" -version = "0.2.2" +version = "0.2.3" source 
= "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" +checksum = "b3a6a8c165077efc8f3a971534c50ea6a1a18b329ef4a66e897a7e3a1494565f" [[package]] name = "libc" -version = "0.2.177" +version = "0.2.186" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" [[package]] name = "libloading" -version = "0.7.4" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" +checksum = "754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60" dependencies = [ "cfg-if", - "winapi", + "windows-link", ] [[package]] -name = "libloading" -version = "0.8.9" +name = "liblzma" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +checksum = "b6033b77c21d1f56deeae8014eb9fbe7bdf1765185a6c508b5ca82eeaed7f899" dependencies = [ - "cfg-if", - "windows-link 0.2.0", + "liblzma-sys", +] + +[[package]] +name = "liblzma-sys" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a60851d15cd8c5346eca4ab8babff585be2ae4bc8097c067291d3ffe2add3b6" +dependencies = [ + "cc", + "libc", + "pkg-config", ] [[package]] name = "libm" -version = "0.2.15" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" +checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" [[package]] name = "libmimalloc-sys" -version = "0.1.44" +version = "0.1.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "667f4fec20f29dfc6bc7357c582d91796c169ad7e2fce709468aefeb2c099870" +checksum = "2d1eacfa31c33ec25e873c136ba5669f00f9866d0688bea7be4d3f7e43067df6" dependencies = [ "cc", "cty", - "libc", ] [[package]] name = "libredox" -version = "0.1.10" +version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "416f7e718bdb06000964960ffa43b4335ad4012ae8b99060261aa4a8088d5ccb" +checksum = "e02f3bb43d335493c96bf3fd3a321600bf6bd07ed34bc64118e9293bdffea46c" dependencies = [ - "bitflags 2.9.4", + "bitflags", "libc", - "redox_syscall", + "plain", + "redox_syscall 0.7.4", ] [[package]] name = "libtest-mimic" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5297962ef19edda4ce33aaa484386e0a5b3d7f2f4e037cbeee00503ef6b29d33" +checksum = "14e6ba06f0ade6e504aff834d7c34298e5155c6baca353cc6a4aaff2f9fd7f33" dependencies = [ "anstream", "anstyle", - "clap 4.5.50", + "clap", "escape8259", ] [[package]] -name = "libz-rs-sys" -version = "0.5.2" +name = "link-section" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "840db8cf39d9ec4dd794376f38acc40d0fc65eec2a8f484f7fd375b84602becd" -dependencies = [ - "zlib-rs", -] +checksum = "b685d66585d646efe09fec763d796c291049c8b6bf84e04954bffc8748341f0d" [[package]] name = "linux-raw-sys" -version = "0.11.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" +checksum = 
"32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" [[package]] name = "litemap" -version = "0.8.0" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" +checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" [[package]] name = "lock_api" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" dependencies = [ - "autocfg", "scopeguard", ] [[package]] name = "log" -version = "0.4.28" +version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" [[package]] name = "lru-slab" @@ -4073,24 +3982,13 @@ checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" [[package]] name = "lz4_flex" -version = "0.11.5" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08ab2867e3eeeca90e844d1940eab391c9dc5228783db2ed999acbc0a9ed375a" +checksum = "db9a0d582c2874f68138a16ce1867e0ffde6c0bb0a0df85e1f36d04146db488a" dependencies = [ "twox-hash", ] -[[package]] -name = "lzma-sys" -version = "0.1.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - [[package]] name = "matchit" version = "0.8.4" @@ -4104,29 +4002,30 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" dependencies = [ "cfg-if", - "digest", + "digest 0.10.7", ] [[package]] -name = "memchr" -version = "2.7.5" +name = "md-5" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" +checksum = "69b6441f590336821bb897fb28fc622898ccceb1d6cea3fde5ea86b090c4de98" +dependencies = [ + "cfg-if", + "digest 0.11.2", +] [[package]] -name = "memoffset" -version = "0.9.1" +name = "memchr" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" -dependencies = [ - "autocfg", -] +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" [[package]] name = "mimalloc" -version = "0.1.48" +version = "0.1.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1ee66a4b64c74f4ef288bcbb9192ad9c3feaad75193129ac8509af543894fd8" +checksum = "b3627c4272df786b9260cabaa46aec1d59c93ede723d4c3ef646c503816b0640" dependencies = [ "libmimalloc-sys", ] @@ -4139,20 +4038,14 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "minicov" -version = "0.3.7" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f27fe9f1cc3c22e1687f9446c2083c4c5fc7f0bcf1c7a86bdbded14985895b4b" +checksum = "4869b6a491569605d66d3952bcdf03df789e5b536e5f0cf7758a7f08a55ae24d" dependencies = [ "cc", "walkdir", ] -[[package]] -name = "minimal-lexical" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" - [[package]] name = "miniz_oxide" version = "0.8.9" @@ -4165,13 +4058,13 @@ dependencies = [ [[package]] name = "mio" -version = "1.0.4" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c" +checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1" dependencies = [ "libc", - "wasi", - "windows-sys 0.59.0", + "wasi 0.11.1+wasi-snapshot-preview1", + "windows-sys 0.61.2", ] [[package]] @@ -4191,11 +4084,11 @@ dependencies = [ [[package]] name = "nix" -version = "0.30.1" +version = "0.31.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" +checksum = "5d6d0705320c1e6ba1d912b5e37cf18071b6c2e9b7fa8215a1e8a7651966f5d3" dependencies = [ - "bitflags 2.9.4", + "bitflags", "cfg-if", "cfg_aliases", "libc", @@ -4203,30 +4096,29 @@ dependencies = [ [[package]] name = "nom" -version = "7.1.3" +version = "8.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405" dependencies = [ "memchr", - "minimal-lexical", ] [[package]] name = "ntapi" -version = "0.4.1" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4" +checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae" dependencies = [ "winapi", ] [[package]] name = "nu-ansi-term" -version = "0.50.1" +version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4a28e057d01f97e61255210fcff094d74ed0466038633e95017f5beb68e4399" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.60.2", ] [[package]] @@ -4251,7 +4143,6 @@ checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ "num-integer", "num-traits", - "serde", ] [[package]] @@ -4265,9 +4156,9 @@ dependencies = [ [[package]] name = "num-conv" -version = "0.1.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967" [[package]] name = "num-integer" @@ -4312,48 +4203,68 @@ dependencies = [ [[package]] name = "objc2-core-foundation" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c10c2894a6fed806ade6027bcd50662746363a9589d3ec9d9bef30a4e4bc166" +checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536" dependencies = [ - "bitflags 2.9.4", + "bitflags", ] [[package]] name = "objc2-io-kit" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71c1c64d6120e51cd86033f67176b1cb66780c2efe34dec55176f77befd93c0a" +checksum = "33fafba39597d6dc1fb709123dfa8289d39406734be322956a69f0931c73bb15" dependencies = [ "libc", "objc2-core-foundation", ] +[[package]] +name = "objc2-system-configuration" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7216bd11cbda54ccabcab84d523dc93b858ec75ecfb3a7d89513fa22464da396" +dependencies = [ + "objc2-core-foundation", +] + +[[package]] +name = "object" +version = "0.37.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" +dependencies = [ + "memchr", +] + [[package]] name = "object_store" -version = "0.12.4" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c1be0c6c22ec0817cdc77d3842f721a17fd30ab6965001415b5402a74e6b740" +checksum = "622acbc9100d3c10e2ee15804b0caa40e55c933d5aa53814cd520805b7958a49" dependencies = [ "async-trait", "base64 0.22.1", "bytes", "chrono", "form_urlencoded", - "futures", - "http 1.3.1", + "futures-channel", + "futures-core", + "futures-util", + "http 1.4.0", "http-body-util", "humantime", "hyper", "itertools 0.14.0", - "md-5", + "md-5 0.10.6", "parking_lot", "percent-encoding", "quick-xml", - "rand 0.9.2", + "rand 0.10.1", "reqwest", "ring", - "rustls-pemfile", + "rustls-pki-types", "serde", "serde_json", "serde_urlencoded", @@ -4368,15 +4279,15 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.21.3" +version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" [[package]] name = "once_cell_polyfill" -version = "1.70.1" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] name = "oorandom" @@ -4386,9 +4297,9 @@ checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" [[package]] name = "openssl-probe" -version = "0.1.6" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" +checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" [[package]] name = "option-ext" @@ -4413,15 +4324,25 @@ checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" [[package]] name = "owo-colors" -version = "4.2.2" +version = "4.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d211803b9b6b570f68772237e415a029d5a50c65d382910b879fb19d3271f94d" + +[[package]] +name = "page_size" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48dd4f4a2c8405440fd0462561f0e5806bd0f77e86f51c761481bdd4018b545e" +checksum = "30d5b2194ed13191c1999ae0704b7839fb18384fa22e49b57eeaa97d79ce40da" +dependencies = [ + "libc", + "winapi", +] [[package]] name = "parking_lot" -version = "0.12.4" +version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70d58bf43669b5795d1576d0641cfb6fbb2057bf629506267a92807158584a13" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" dependencies = [ "lock_api", "parking_lot_core", @@ -4429,27 +4350,26 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.11" +version = "0.9.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc838d2a56b5b1a6c25f55575dfc605fabb63bb2365f6c2353ef9159aa69e4a5" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" dependencies = [ "cfg-if", 
"libc", - "redox_syscall", + "redox_syscall 0.5.18", "smallvec", - "windows-targets 0.52.6", + "windows-link", ] [[package]] name = "parquet" -version = "57.0.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a0f31027ef1af7549f7cec603a9a21dce706d3f8d7c2060a68f43c1773be95a" +checksum = "7d3f9f2205199603564127932b89695f52b62322f541d0fc7179d57c2e1c9877" dependencies = [ - "ahash 0.8.12", + "ahash", "arrow-array", "arrow-buffer", - "arrow-cast", "arrow-data", "arrow-ipc", "arrow-schema", @@ -4461,7 +4381,7 @@ dependencies = [ "flate2", "futures", "half", - "hashbrown 0.16.0", + "hashbrown 0.16.1", "lz4_flex", "num-bigint", "num-integer", @@ -4500,7 +4420,7 @@ dependencies = [ "regex", "regex-syntax", "structmeta", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] @@ -4519,13 +4439,35 @@ dependencies = [ "serde", ] +[[package]] +name = "pbjson" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8edd1efdd8ab23ba9cb9ace3d9987a72663d5d7c9f74fa00b51d6213645cf6c" +dependencies = [ + "base64 0.22.1", + "serde", +] + [[package]] name = "pbjson-build" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af22d08a625a2213a78dbb0ffa253318c5c79ce3133d32d296655a7bdfb02095" dependencies = [ - "heck 0.5.0", + "heck", + "itertools 0.14.0", + "prost", + "prost-types", +] + +[[package]] +name = "pbjson-build" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ed4d5c6ae95e08ac768883c8401cf0e8deb4e6e1d6a4e1fd3d2ec4f0ec63200" +dependencies = [ + "heck", "itertools 0.14.0", "prost", "prost-types", @@ -4539,8 +4481,8 @@ checksum = "8e748e28374f10a330ee3bb9f29b828c0ac79831a32bab65015ad9b661ead526" dependencies = [ "bytes", "chrono", - "pbjson", - "pbjson-build", + "pbjson 0.8.0", + "pbjson-build 0.8.0", "prost", "prost-build", "serde", @@ -4552,16 +4494,6 @@ version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" -[[package]] -name = "petgraph" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" -dependencies = [ - "fixedbitset", - "indexmap 2.12.0", -] - [[package]] name = "petgraph" version = "0.8.3" @@ -4570,7 +4502,7 @@ checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" dependencies = [ "fixedbitset", "hashbrown 0.15.5", - "indexmap 2.12.0", + "indexmap 2.14.0", "serde", ] @@ -4613,29 +4545,29 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.1.10" +version = "1.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a" +checksum = "f1749c7ed4bcaf4c3d0a3efc28538844fb29bcdd7d2b67b2be7e20ba861ff517" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.10" +version = "1.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" +checksum = "d9b20ed30f105399776b9c883e68e536ef602a16ae6f596d2c473591d6ad64c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] name = "pin-project-lite" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" [[package]] name = "pin-utils" @@ -4645,9 +4577,15 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.32" +version = "0.3.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" + +[[package]] +name = "plain" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" [[package]] name = "plotters" @@ -4679,54 +4617,54 @@ dependencies = [ [[package]] name = "portable-atomic" -version = "1.11.1" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" [[package]] name = "portable-atomic-util" -version = "0.2.4" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +checksum = "c2a106d1259c23fac8e543272398ae0e3c0b8d33c88ed73d0cc71b0f1d902618" dependencies = [ "portable-atomic", ] [[package]] name = "postgres-derive" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56df96f5394370d1b20e49de146f9e6c25aa9ae750f449c9d665eafecb3ccae6" +checksum = "ca1dad89d9ffdbf78502fde418eeede499b87772d88be780478f7f76dc8d471f" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] name = "postgres-protocol" -version = "0.6.9" +version = "0.6.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbef655056b916eb868048276cfd5d6a7dea4f81560dfd047f97c8c6fe3fcfd4" +checksum = "56201207dac53e2f38e848e31b4b91616a6bb6e0c7205b77718994a7f49e70fc" dependencies = [ "base64 0.22.1", "byteorder", "bytes", "fallible-iterator", "hmac", - "md-5", + "md-5 0.11.0", "memchr", - "rand 0.9.2", + "rand 0.10.1", "sha2", "stringprep", ] [[package]] name = "postgres-types" -version = "0.2.11" +version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef4605b7c057056dd35baeb6ac0c0338e4975b1f2bef0f65da953285eb007095" +checksum = "8dc729a129e682e8d24170cd30ae1aa01b336b096cbb56df6d534ffec133d186" dependencies = [ "bytes", "chrono", @@ -4737,9 +4675,9 @@ dependencies = [ [[package]] name = "potential_utf" -version = "0.1.3" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84df19adbe5b5a0782edcab45899906947ab039ccf4573713735ee7de1e6b08a" +checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564" dependencies = [ "zerovec", ] @@ -4776,56 +4714,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] name = "proc-macro-crate" -version = "3.4.0" +version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" +checksum = 
"e67ba7e9b2b56446f1d419b1d807906278ffa1a658a8a5d8a39dcb1f5a78614f" dependencies = [ "toml_edit", ] [[package]] -name = "proc-macro-error" -version = "1.0.4" +name = "proc-macro2" +version = "1.0.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" dependencies = [ - "proc-macro-error-attr", - "proc-macro2", - "quote", - "syn 1.0.109", - "version_check", + "unicode-ident", ] [[package]] -name = "proc-macro-error-attr" -version = "1.0.4" +name = "prost" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" -dependencies = [ - "proc-macro2", - "quote", - "version_check", -] - -[[package]] -name = "proc-macro2" -version = "1.0.101" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "prost" -version = "0.14.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7231bd9b3d3d33c86b58adbac74b5ec0ad9f496b19d22801d773636feaa95f3d" +checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568" dependencies = [ "bytes", "prost-derive", @@ -4833,42 +4747,41 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.14.1" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac6c3320f9abac597dcbc668774ef006702672474aad53c6d596b62e487b40b1" +checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" dependencies = [ - "heck 0.5.0", + "heck", "itertools 0.14.0", "log", "multimap", - "once_cell", - "petgraph 0.7.1", + "petgraph", "prettyplease", "prost", "prost-types", "regex", - "syn 2.0.108", + "syn 2.0.117", "tempfile", ] [[package]] name = "prost-derive" -version = "0.14.1" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9120690fafc389a67ba3803df527d0ec9cbbc9cc45e4cc20b332996dfb672425" +checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" dependencies = [ "anyhow", "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] name = "prost-types" -version = "0.14.1" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9b4db3d6da204ed77bb26ba83b6122a73aeb2e87e25fbf7ad2e84c4ccbf8f72" +checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7" dependencies = [ "prost", ] @@ -4884,105 +4797,19 @@ dependencies = [ [[package]] name = "psm" -version = "0.1.26" +version = "0.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e944464ec8536cd1beb0bbfd96987eb5e3b72f2ecdafdc5c769a37f1fa2ae1f" +checksum = "645dbe486e346d9b5de3ef16ede18c26e6c70ad97418f4874b8b1889d6e761ea" dependencies = [ + "ar_archive_writer", "cc", ] -[[package]] -name = "ptr_meta" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0738ccf7ea06b608c10564b31debd4f5bc5e197fc8bfe088f68ae5ce81e7a4f1" -dependencies = [ - "ptr_meta_derive", -] - -[[package]] -name = "ptr_meta_derive" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16b845dbfca988fa33db069c0e230574d15a3088f147a87b64c7589eb662c9ac" -dependencies = [ 
- "proc-macro2", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "pyo3" -version = "0.26.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ba0117f4212101ee6544044dae45abe1083d30ce7b29c4b5cbdfa2354e07383" -dependencies = [ - "indoc", - "libc", - "memoffset", - "once_cell", - "portable-atomic", - "pyo3-build-config", - "pyo3-ffi", - "pyo3-macros", - "unindent", -] - -[[package]] -name = "pyo3-build-config" -version = "0.26.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fc6ddaf24947d12a9aa31ac65431fb1b851b8f4365426e182901eabfb87df5f" -dependencies = [ - "target-lexicon", -] - -[[package]] -name = "pyo3-ffi" -version = "0.26.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "025474d3928738efb38ac36d4744a74a400c901c7596199e20e45d98eb194105" -dependencies = [ - "libc", - "pyo3-build-config", -] - -[[package]] -name = "pyo3-macros" -version = "0.26.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e64eb489f22fe1c95911b77c44cc41e7c19f3082fc81cce90f657cdc42ffded" -dependencies = [ - "proc-macro2", - "pyo3-macros-backend", - "quote", - "syn 2.0.108", -] - -[[package]] -name = "pyo3-macros-backend" -version = "0.26.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "100246c0ecf400b475341b8455a9213344569af29a3c841d29270e53102e0fcf" -dependencies = [ - "heck 0.5.0", - "proc-macro2", - "pyo3-build-config", - "quote", - "syn 2.0.108", -] - -[[package]] -name = "quad-rand" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" - [[package]] name = "quick-xml" -version = "0.38.3" +version = "0.39.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42a232e7487fc2ef313d96dde7948e7a3c05101870d8985e4fd8d26aedd27b89" +checksum = "958f21e8e7ceb5a1aa7fa87fab28e7c75976e0bfe7e23ff069e0a260f894067d" dependencies = [ "memchr", "serde", @@ -5010,14 +4837,14 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.13" +version = "0.11.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" +checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" dependencies = [ "bytes", "getrandom 0.3.4", "lru-slab", - "rand 0.9.2", + "rand 0.9.4", "ring", "rustc-hash", "rustls", @@ -5040,14 +4867,14 @@ dependencies = [ "once_cell", "socket2", "tracing", - "windows-sys 0.60.2", + "windows-sys 0.52.0", ] [[package]] name = "quote" -version = "1.0.41" +version = "1.0.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" dependencies = [ "proc-macro2", ] @@ -5059,16 +4886,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" [[package]] -name = "radium" -version = "0.7.0" +name = "r-efi" +version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" [[package]] name = "radix_trie" -version = "0.2.1" +version = "0.3.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c069c179fcdc6a2fe24d8d18305cf085fdbd4f922c041943e203685d6a1c58fd" +checksum = "3b4431027dcd37fc2a73ef740b5f233aa805897935b8bce0195e41bbf9a3289a" dependencies = [ "endian-type", "nibble_vec", @@ -5076,9 +4903,9 @@ dependencies = [ [[package]] name = "rand" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a" dependencies = [ "libc", "rand_chacha 0.3.1", @@ -5087,12 +4914,23 @@ dependencies = [ [[package]] name = "rand" -version = "0.9.2" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" dependencies = [ "rand_chacha 0.9.0", - "rand_core 0.9.3", + "rand_core 0.9.5", +] + +[[package]] +name = "rand" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" +dependencies = [ + "chacha20", + "getrandom 0.4.2", + "rand_core 0.10.1", ] [[package]] @@ -5112,7 +4950,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core 0.9.3", + "rand_core 0.9.5", ] [[package]] @@ -5121,18 +4959,24 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom 0.2.16", + "getrandom 0.2.17", ] [[package]] name = "rand_core" -version = "0.9.3" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rand_core" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" + [[package]] name = "rand_distr" version = "0.5.1" @@ -5140,14 +4984,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463" dependencies = [ "num-traits", - "rand 0.9.2", + "rand 0.9.4", ] [[package]] name = "rayon" -version = "1.11.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d" dependencies = [ "either", "rayon-core", @@ -5180,16 +5024,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] name = "redox_syscall" -version = "0.5.17" +version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5407465600fb0548f1442edf71dd20683c6ed326200ace4b1ef0763521bb3b77" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags 2.9.4", + "bitflags", +] + 
+[[package]] +name = "redox_syscall" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f450ad9c3b1da563fb6948a8e0fb0fb9269711c9c73d9ea1de5058c79c8d643a" +dependencies = [ + "bitflags", ] [[package]] @@ -5198,36 +5051,36 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" dependencies = [ - "getrandom 0.2.16", + "getrandom 0.2.17", "libredox", "thiserror", ] [[package]] name = "ref-cast" -version = "1.0.24" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a0ae411dbe946a674d89546582cea4ba2bb8defac896622d6496f14c23ba5cf" +checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d" dependencies = [ "ref-cast-impl", ] [[package]] name = "ref-cast-impl" -version = "1.0.24" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1165225c21bff1f3bbce98f5a1f889949bc902d3575308cc7b0de30b4f6d27c7" +checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] name = "regex" -version = "1.12.2" +version = "1.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" dependencies = [ "aho-corasick", "memchr", @@ -5237,9 +5090,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" dependencies = [ "aho-corasick", "memchr", @@ -5248,23 +5101,23 @@ dependencies = [ [[package]] name = "regex-lite" -version = "0.1.7" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "943f41321c63ef1c92fd763bfe054d2668f7f225a5c29f0105903dc2fc04ba30" +checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" [[package]] name = "regex-syntax" -version = "0.8.6" +version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" [[package]] name = "regress" -version = "0.10.4" +version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "145bb27393fe455dd64d6cbc8d059adfa392590a45eadf079c01b11857e7b010" +checksum = "2057b2325e68a893284d1538021ab90279adac1139957ca2a74426c6f118fb48" dependencies = [ - "hashbrown 0.15.5", + "hashbrown 0.16.1", "memchr", ] @@ -5274,36 +5127,18 @@ version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba39f3699c378cd8970968dcbff9c43159ea4cfbd88d43c00b22f2ef10a435d2" -[[package]] -name = "rend" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71fe3824f5629716b1589be05dacd749f6aa084c87e00e016714a8cdfccc997c" -dependencies = [ - "bytecheck", -] - -[[package]] -name = "repr_offset" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb1070755bd29dffc19d0971cab794e607839ba2ef4b69a9e6fbc8733c1b72ea" 
-dependencies = [ - "tstr", -] - [[package]] name = "reqwest" -version = "0.12.23" +version = "0.12.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d429f34c8092b2d42c7c93cec323bb4adeb7c67698f70839adec842ec10c7ceb" +checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" dependencies = [ "base64 0.22.1", "bytes", "futures-core", "futures-util", "h2", - "http 1.3.1", + "http 1.4.0", "http-body 1.0.1", "http-body-util", "hyper", @@ -5342,41 +5177,12 @@ checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" dependencies = [ "cc", "cfg-if", - "getrandom 0.2.16", + "getrandom 0.2.17", "libc", "untrusted", "windows-sys 0.52.0", ] -[[package]] -name = "rkyv" -version = "0.7.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9008cd6385b9e161d8229e1f6549dd23c3d022f132a2ea37ac3a10ac4935779b" -dependencies = [ - "bitvec", - "bytecheck", - "bytes", - "hashbrown 0.12.3", - "ptr_meta", - "rend", - "rkyv_derive", - "seahash", - "tinyvec", - "uuid", -] - -[[package]] -name = "rkyv_derive" -version = "0.7.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "503d1d27590a2b0a3a4ca4c94755aa2875657196ecbf401a42eff41d7de532c0" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "rstest" version = "0.26.1" @@ -5402,7 +5208,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.108", + "syn 2.0.117", "unicode-ident", ] @@ -5413,32 +5219,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b3a8fb4672e840a587a66fc577a5491375df51ddb88f2a2c2a792598c326fe14" dependencies = [ "quote", - "rand 0.8.5", - "syn 2.0.108", -] - -[[package]] -name = "rust_decimal" -version = "1.38.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8975fc98059f365204d635119cf9c5a60ae67b841ed49b5422a9a7e56cdfac0" -dependencies = [ - "arrayvec", - "borsh", - "bytes", - "num-traits", - "postgres-types", - "rand 0.8.5", - "rkyv", - "serde", - "serde_json", + "rand 0.8.6", + "syn 2.0.117", ] [[package]] name = "rustc-hash" -version = "2.1.1" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" +checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" [[package]] name = "rustc_version" @@ -5451,22 +5240,22 @@ dependencies = [ [[package]] name = "rustix" -version = "1.1.2" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ - "bitflags 2.9.4", + "bitflags", "errno", "libc", "linux-raw-sys", - "windows-sys 0.61.0", + "windows-sys 0.52.0", ] [[package]] name = "rustls" -version = "0.23.32" +version = "0.23.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd3c25631629d034ce7cd9940adc9d45762d46de2b0f57193c4443b92c6d4d40" +checksum = "7c2c118cb077cca2822033836dfb1b975355dfb784b5e8da48f7b6c5db74e60e" dependencies = [ "aws-lc-rs", "log", @@ -5480,9 +5269,9 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.8.1" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcff2dd52b58a8d98a70243663a0d234c4e2b79235637849d15913394a247d3" +checksum = 
"612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" dependencies = [ "openssl-probe", "rustls-pki-types", @@ -5490,20 +5279,11 @@ dependencies = [ "security-framework", ] -[[package]] -name = "rustls-pemfile" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" -dependencies = [ - "rustls-pki-types", -] - [[package]] name = "rustls-pki-types" -version = "1.12.0" +version = "1.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79" +checksum = "30a7197ae7eb376e574fe940d068c30fe0462554a3ddbe4eca7838e049c937a9" dependencies = [ "web-time", "zeroize", @@ -5511,9 +5291,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.6" +version = "0.103.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8572f3c2cb9934231157b45499fc41e1f58c589fdfb81a844ba873265e80f8eb" +checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" dependencies = [ "aws-lc-rs", "ring", @@ -5529,14 +5309,13 @@ checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "rustyline" -version = "17.0.2" +version = "18.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e902948a25149d50edc1a8e0141aad50f54e22ba83ff988cf8f7c9ef07f50564" +checksum = "4a990b25f351b25139ddc7f21ee3f6f56f86d6846b74ac8fad3a719a287cd4a0" dependencies = [ - "bitflags 2.9.4", + "bitflags", "cfg-if", "clipboard-win", - "fd-lock", "home", "libc", "log", @@ -5544,16 +5323,16 @@ dependencies = [ "nix", "radix_trie", "unicode-segmentation", - "unicode-width 0.2.1", + "unicode-width 0.2.2", "utf8parse", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] name = "ryu" -version = "1.0.20" +version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" [[package]] name = "same-file" @@ -5566,11 +5345,11 @@ dependencies = [ [[package]] name = "schannel" -version = "0.1.28" +version = "0.1.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" +checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939" dependencies = [ - "windows-sys 0.61.0", + "windows-sys 0.61.2", ] [[package]] @@ -5599,9 +5378,9 @@ dependencies = [ [[package]] name = "schemars" -version = "1.0.4" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82d20c4491bc164fa2f6c5d44565947a52ad80b9505d8e36f8d54c27c739fcd0" +checksum = "a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc" dependencies = [ "dyn-clone", "ref-cast", @@ -5618,7 +5397,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] @@ -5627,19 +5406,13 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" -[[package]] -name = "seahash" -version = "4.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" - [[package]] name = "security-framework" -version = "3.5.0" +version = 
"3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc198e42d9b7510827939c9a15f5062a0c913f3371d765977e586d2fe6c16f4a" +checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" dependencies = [ - "bitflags 2.9.4", + "bitflags", "core-foundation", "core-foundation-sys", "libc", @@ -5648,9 +5421,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.15.0" +version = "2.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc1f0cbffaac4852523ce30d8bd3c5cdc873501d96ff467ca09b6767bb8cd5c0" +checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3" dependencies = [ "core-foundation-sys", "libc", @@ -5658,9 +5431,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.27" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" dependencies = [ "serde", "serde_core", @@ -5682,16 +5455,6 @@ dependencies = [ "serde_derive", ] -[[package]] -name = "serde_bytes" -version = "0.11.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5d440709e79d88e51ac01c4b72fc6cb7314017bb7da9eeff678aa94c10e3ea8" -dependencies = [ - "serde", - "serde_core", -] - [[package]] name = "serde_core" version = "1.0.228" @@ -5709,7 +5472,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] @@ -5720,20 +5483,21 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] name = "serde_json" -version = "1.0.145" +version = "1.0.149" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" dependencies = [ + "indexmap 2.14.0", "itoa", "memchr", - "ryu", "serde", "serde_core", + "zmij", ] [[package]] @@ -5744,19 +5508,19 @@ checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] name = "serde_tokenstream" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64060d864397305347a78851c51588fd283767e7e7589829e8121d65512340f1" +checksum = "d7c49585c52c01f13c5c2ebb333f14f6885d76daa768d8a037d28017ec538c69" dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] @@ -5773,19 +5537,18 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.14.1" +version = "3.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c522100790450cf78eeac1507263d0a350d4d5b30df0c8e1fe051a10c22b376e" +checksum = "dd5414fad8e6907dbdd5bc441a50ae8d6e26151a03b1de04d89a5576de61d01f" dependencies = [ "base64 0.22.1", "chrono", "hex", "indexmap 1.9.3", - "indexmap 2.12.0", + "indexmap 2.14.0", "schemars 0.9.0", - "schemars 1.0.4", - "serde", - "serde_derive", + "schemars 1.2.1", + "serde_core", "serde_json", "serde_with_macros", "time", @@ -5793,14 +5556,14 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.14.1" +version = "3.18.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "327ada00f7d64abaac1e55a6911e90cf665aa051b9a561c7006c157f4633135e" +checksum = "d3db8978e608f1fe7357e211969fd9abdcae80bac1ba7a3369bb7eb6b404eb65" dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] @@ -5809,7 +5572,7 @@ version = "0.9.34+deprecated" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" dependencies = [ - "indexmap 2.12.0", + "indexmap 2.14.0", "itoa", "ryu", "serde", @@ -5823,21 +5586,38 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" dependencies = [ "cfg-if", - "cpufeatures", - "digest", + "cpufeatures 0.2.17", + "digest 0.10.7", +] + +[[package]] +name = "sha1" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aacc4cc499359472b4abe1bf11d0b12e688af9a805fa5e3016f9a386dc2d0214" +dependencies = [ + "cfg-if", + "cpufeatures 0.3.0", + "digest 0.11.2", ] [[package]] name = "sha2" -version = "0.10.9" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +checksum = "446ba717509524cb3f22f17ecc096f10f4822d76ab5c0b9822c5f9c284e825f4" dependencies = [ "cfg-if", - "cpufeatures", - "digest", + "cpufeatures 0.3.0", + "digest 0.11.2", ] +[[package]] +name = "sha2-const-stable" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f179d4e11094a893b82fff208f74d448a7512f99f5a0acbd5c679b705f83ed9" + [[package]] name = "sharded-slab" version = "0.1.7" @@ -5855,18 +5635,19 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "signal-hook-registry" -version = "1.4.6" +version = "1.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2a4719bff48cee6b39d12c020eeb490953ad2443b7055bd0b21fca26bd8c28b" +checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b" dependencies = [ + "errno", "libc", ] [[package]] name = "simd-adler32" -version = "0.3.7" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" +checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" [[package]] name = "simdutf8" @@ -5882,15 +5663,15 @@ checksum = "bbbb5d9659141646ae647b42fe094daf6c6192d1620870b449d9557f748b2daa" [[package]] name = "siphasher" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" +checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" [[package]] name = "slab" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] name = "smallvec" @@ -5906,37 +5687,37 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] name = "snmalloc-rs" -version = "0.3.8" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"eb317153089fdfa4d8a2eec059d40a5a23c3bde43995ea23b19121c3f621e74a" +checksum = "530a04ae687609072d0edd38866406fbbcd23d2f716791437e312ec4d64a355a" dependencies = [ "snmalloc-sys", ] [[package]] name = "snmalloc-sys" -version = "0.3.8" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "065fea53d32bb77bc36cca466cb191f2e5216ebfd0ed360b1d64889ee6e559ea" +checksum = "a96cbeb16d6bcc5979f80ec907582a886b7fb3b9a707678b63dd93a10d8ee858" dependencies = [ "cmake", ] [[package]] name = "socket2" -version = "0.6.0" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "233504af464074f9d066d7b5416c5f9b894a5862a6506e306f7b816cdd6f1807" +checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" dependencies = [ "libc", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] name = "sqllogictest" -version = "0.28.4" +version = "0.29.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3566426f72a13e393aa34ca3d542c5b0eb86da4c0db137ee9b5cfccc6179e52d" +checksum = "d03b2262a244037b0b510edbd25a8e6c9fb8d73ee0237fc6cc95a54c16f94a82" dependencies = [ "async-trait", "educe", @@ -5946,9 +5727,9 @@ dependencies = [ "humantime", "itertools 0.13.0", "libtest-mimic", - "md-5", + "md-5 0.10.6", "owo-colors", - "rand 0.8.5", + "rand 0.8.6", "regex", "similar", "subst", @@ -5959,9 +5740,9 @@ dependencies = [ [[package]] name = "sqlparser" -version = "0.59.0" +version = "0.61.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4591acadbcf52f0af60eafbb2c003232b2b4cd8de5f0e9437cb8b1b59046cc0f" +checksum = "dbf5ea8d4d7c808e1af1cbabebca9a2abe603bcefc22294c5b95018d53200cb7" dependencies = [ "log", "recursive", @@ -5970,32 +5751,67 @@ dependencies = [ [[package]] name = "sqlparser_derive" -version = "0.3.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" +checksum = "a6dd45d8fc1c79299bfbb7190e42ccbbdf6a5f52e4a6ad98d92357ea965bd289" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", +] + +[[package]] +name = "stabby" +version = "72.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "976399a0c48ea769ef7f5dc303bb88240ab8d84008647a6b2303eced3dab3945" +dependencies = [ + "rustversion", + "stabby-abi", +] + +[[package]] +name = "stabby-abi" +version = "72.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7b54832a9a1f92a0e55e74a5c0332744426edc515bb3fbad82f10b874a87f0d" +dependencies = [ + "rustc_version", + "rustversion", + "sha2-const-stable", + "stabby-macros", +] + +[[package]] +name = "stabby-macros" +version = "72.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a768b1e51e4dbfa4fa52ae5c01241c0a41e2938fdffbb84add0c8238092f9091" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "rand 0.8.6", + "syn 1.0.109", ] [[package]] name = "stable_deref_trait" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" [[package]] name = "stacker" -version = "0.1.21" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"cddb07e32ddb770749da91081d8d0ac3a16f1a569a18b20348cd371f5dead06b" +checksum = "640c8cdd92b6b12f5bcb1803ca3bbf5ab96e5e6b6b96b9ab77dabe9e880b3190" dependencies = [ "cc", "cfg-if", "libc", "psm", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] @@ -6024,7 +5840,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] @@ -6035,68 +5851,25 @@ checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", -] - -[[package]] -name = "structopt" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10" -dependencies = [ - "clap 2.34.0", - "lazy_static", - "structopt-derive", -] - -[[package]] -name = "structopt-derive" -version = "0.4.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0" -dependencies = [ - "heck 0.3.3", - "proc-macro-error", - "proc-macro2", - "quote", - "syn 1.0.109", + "syn 2.0.117", ] [[package]] name = "strum" -version = "0.26.3" +version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" - -[[package]] -name = "strum" -version = "0.27.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" - -[[package]] -name = "strum_macros" -version = "0.26.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" -dependencies = [ - "heck 0.5.0", - "proc-macro2", - "quote", - "rustversion", - "syn 2.0.108", -] +checksum = "9628de9b8791db39ceda2b119bbe13134770b56c138ec1d3af810d045c04f9bd" [[package]] name = "strum_macros" -version = "0.27.2" +version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" +checksum = "ab85eea0270ee17587ed4156089e10b9e6880ee688791d45a905f5b1ca36f664" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] @@ -6111,13 +5884,14 @@ dependencies = [ [[package]] name = "substrait" -version = "0.62.0" +version = "0.63.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21f1cb6d0bcd097a39fc25f7236236be29881fe122e282e4173d6d007a929927" +checksum = "e620ff4d5c02fd6f7752931aa74b16a26af66a63022cc1ad412c77edbe0bab47" dependencies = [ - "heck 0.5.0", - "pbjson", - "pbjson-build", + "heck", + "indexmap 2.14.0", + "pbjson 0.8.0", + "pbjson-build 0.8.0", "pbjson-types", "prettyplease", "prost", @@ -6130,7 +5904,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.108", + "syn 2.0.117", "typify", "walkdir", ] @@ -6154,9 +5928,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.108" +version = "2.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" dependencies = [ "proc-macro2", "quote", @@ -6180,46 +5954,34 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] 
[[package]] name = "sysinfo" -version = "0.37.2" +version = "0.38.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16607d5caffd1c07ce073528f9ed972d88db15dd44023fa57142963be3feb11f" +checksum = "92ab6a2f8bfe508deb3c6406578252e491d299cbbf3bc0529ecc3313aee4a52f" dependencies = [ "libc", - "memchr", - "ntapi", - "objc2-core-foundation", - "objc2-io-kit", - "windows", -] - -[[package]] -name = "tap" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" - -[[package]] -name = "target-lexicon" -version = "0.13.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df7f62577c25e07834649fc3b39fafdc597c0a3527dc1c60129201ccfcbaa50c" + "memchr", + "ntapi", + "objc2-core-foundation", + "objc2-io-kit", + "windows", +] [[package]] name = "tempfile" -version = "3.23.0" +version = "3.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" dependencies = [ "fastrand", - "getrandom 0.3.4", + "getrandom 0.4.2", "once_cell", "rustix", - "windows-sys 0.61.0", + "windows-sys 0.52.0", ] [[package]] @@ -6230,14 +5992,14 @@ dependencies = [ "chrono-tz", "datafusion-common", "env_logger", - "rand 0.9.2", + "rand 0.9.4", ] [[package]] name = "testcontainers" -version = "0.25.2" +version = "0.27.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f3ac71069f20ecfa60c396316c283fbf35e6833a53dff551a31b5458da05edc" +checksum = "bfd5785b5483672915ed5fe3cddf9f546802779fc1eceff0a6fb7321fac81c1e" dependencies = [ "astral-tokio-tar", "async-trait", @@ -6246,7 +6008,10 @@ dependencies = [ "docker_credential", "either", "etcetera", + "ferroid", "futures", + "http 1.4.0", + "itertools 0.14.0", "log", "memchr", "parse-display", @@ -6258,46 +6023,36 @@ dependencies = [ "tokio", "tokio-stream", "tokio-util", - "ulid", "url", ] [[package]] name = "testcontainers-modules" -version = "0.13.0" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1966329d5bb3f89d33602d2db2da971fb839f9297dad16527abf4564e2ae0a6d" +checksum = "e5985fde5befe4ffa77a052e035e16c2da86e8bae301baa9f9904ad3c494d357" dependencies = [ "testcontainers", ] -[[package]] -name = "textwrap" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" -dependencies = [ - "unicode-width 0.1.14", -] - [[package]] name = "thiserror" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] @@ -6322,30 +6077,30 @@ dependencies = [ [[package]] name = "time" -version = "0.3.44" +version = "0.3.47" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e7d9e3bb61134e77bde20dd4825b97c010155709965fedf0f49bb138e52a9d" +checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" dependencies = [ "deranged", "itoa", "num-conv", "powerfmt", - "serde", + "serde_core", "time-core", "time-macros", ] [[package]] name = "time-core" -version = "0.1.6" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40868e7c1d2f0b8d73e4a8c7f0ff63af4f6d19be117e90bd73eb1d62cf831c6b" +checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" [[package]] name = "time-macros" -version = "0.2.24" +version = "0.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30cfb0125f12d9c277f35663a0a33f8c30190f4e4574868a330595412d34ebf3" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" dependencies = [ "num-conv", "time-core", @@ -6362,9 +6117,9 @@ dependencies = [ [[package]] name = "tinystr" -version = "0.8.1" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" +checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d" dependencies = [ "displaydoc", "zerovec", @@ -6382,9 +6137,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" +checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3" dependencies = [ "tinyvec_macros", ] @@ -6397,9 +6152,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.48.0" +version = "1.52.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408" +checksum = "b67dee974fe86fd92cc45b7a95fdd2f99a36a6d7b0d431a231178d3d670bbcc6" dependencies = [ "bytes", "libc", @@ -6409,25 +6164,25 @@ dependencies = [ "signal-hook-registry", "socket2", "tokio-macros", - "windows-sys 0.61.0", + "windows-sys 0.61.2", ] [[package]] name = "tokio-macros" -version = "2.6.0" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" +checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] name = "tokio-postgres" -version = "0.7.14" +version = "0.7.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a156efe7fff213168257853e1dfde202eed5f487522cbbbf7d219941d753d853" +checksum = "4dd8df5ef180f6364759a6f00f7aadda4fbbac86cdee37480826a6ff9f3574ce" dependencies = [ "async-trait", "byteorder", @@ -6442,7 +6197,7 @@ dependencies = [ "pin-project-lite", "postgres-protocol", "postgres-types", - "rand 0.9.2", + "rand 0.10.1", "socket2", "tokio", "tokio-util", @@ -6451,9 +6206,9 @@ dependencies = [ [[package]] name = "tokio-rustls" -version = "0.26.3" +version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05f63835928ca123f1bef57abbcd23bb2ba0ac9ae1235f1e65bda0d06e7786bd" +checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" dependencies = [ "rustls", "tokio", @@ -6461,20 +6216,21 @@ 
dependencies = [ [[package]] name = "tokio-stream" -version = "0.1.17" +version = "0.1.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047" +checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70" dependencies = [ "futures-core", "pin-project-lite", "tokio", + "tokio-util", ] [[package]] name = "tokio-util" -version = "0.7.16" +version = "0.7.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14307c986784f72ef81c89db7d9e28d6ac26d16213b109ea501696195e6e3ce5" +checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" dependencies = [ "bytes", "futures-core", @@ -6485,20 +6241,20 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "0.7.2" +version = "1.1.1+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32f1085dec27c2b6632b04c80b3bb1b4300d6495d1e129693bdda7d91e72eec1" +checksum = "3165f65f62e28e0115a00b2ebdd37eb6f3b641855f9d636d3cd4103767159ad7" dependencies = [ "serde_core", ] [[package]] name = "toml_edit" -version = "0.23.6" +version = "0.25.11+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3effe7c0e86fdff4f69cdd2ccc1b96f933e24811c5441d44904e8683e27184b" +checksum = "0b59c4d22ed448339746c59b905d24568fcbb3ab65a500494f7b8c3e97739f2b" dependencies = [ - "indexmap 2.12.0", + "indexmap 2.14.0", "toml_datetime", "toml_parser", "winnow", @@ -6506,25 +6262,25 @@ dependencies = [ [[package]] name = "toml_parser" -version = "1.0.3" +version = "1.1.2+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cf893c33be71572e0e9aa6dd15e6677937abd686b066eac3f8cd3531688a627" +checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526" dependencies = [ "winnow", ] [[package]] name = "tonic" -version = "0.14.2" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb7613188ce9f7df5bfe185db26c5814347d110db17920415cf2fbcad85e7203" +checksum = "fec7c61a0695dc1887c1b53952990f3ad2e3a31453e1f49f10e75424943a93ec" dependencies = [ "async-trait", "axum", "base64 0.22.1", "bytes", "h2", - "http 1.3.1", + "http 1.4.0", "http-body 1.0.1", "http-body-util", "hyper", @@ -6544,9 +6300,9 @@ dependencies = [ [[package]] name = "tonic-prost" -version = "0.14.2" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66bd50ad6ce1252d87ef024b3d64fe4c3cf54a86fb9ef4c631fdd0ded7aeaa67" +checksum = "a55376a0bbaa4975a3f10d009ad763d8f4108f067c7c2e74f3001fb49778d309" dependencies = [ "bytes", "prost", @@ -6555,13 +6311,13 @@ dependencies = [ [[package]] name = "tower" -version = "0.5.2" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ "futures-core", "futures-util", - "indexmap 2.12.0", + "indexmap 2.14.0", "pin-project-lite", "slab", "sync_wrapper", @@ -6574,14 +6330,14 @@ dependencies = [ [[package]] name = "tower-http" -version = "0.6.6" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2" +checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" dependencies = [ - "bitflags 2.9.4", + "bitflags", 
"bytes", "futures-util", - "http 1.3.1", + "http 1.4.0", "http-body 1.0.1", "iri-string", "pin-project-lite", @@ -6604,9 +6360,9 @@ checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" -version = "0.1.41" +version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" dependencies = [ "pin-project-lite", "tracing-attributes", @@ -6615,20 +6371,20 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.30" +version = "0.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] name = "tracing-core" -version = "0.1.34" +version = "0.1.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9d12581f227e93f094d3af2ae690a574abb8a2b9b7a96e7cfe9647b2b617678" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" dependencies = [ "once_cell", "valuable", @@ -6647,9 +6403,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.20" +version = "0.3.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2054a14f5307d601f88daf0553e1cbf472acc4f2c51afab632431cdcd72124d5" +checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319" dependencies = [ "nu-ansi-term", "sharded-slab", @@ -6665,44 +6421,17 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" -[[package]] -name = "tstr" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f8e0294f14baae476d0dd0a2d780b2e24d66e349a9de876f5126777a37bdba7" -dependencies = [ - "tstr_proc_macros", -] - -[[package]] -name = "tstr_proc_macros" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e78122066b0cb818b8afd08f7ed22f7fdbc3e90815035726f0840d0d26c0747a" - [[package]] name = "twox-hash" version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" -[[package]] -name = "typed-arena" -version = "2.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" - [[package]] name = "typenum" -version = "1.18.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" - -[[package]] -name = "typewit" -version = "1.14.2" +version = "1.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8c1ae7cc0fdb8b842d65d127cb981574b0d2b249b74d1c7a2986863dc134f71" +checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de" [[package]] name = "typify" @@ -6720,7 +6449,7 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1eb359f7ffa4f9ebe947fa11a1b2da054564502968db5f317b7e37693cb2240" dependencies = [ - "heck 0.5.0", + "heck", "log", "proc-macro2", "quote", @@ -6729,7 +6458,7 @@ dependencies = [ 
"semver", "serde", "serde_json", - "syn 2.0.108", + "syn 2.0.117", "thiserror", "unicode-ident", ] @@ -6747,20 +6476,10 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.108", + "syn 2.0.117", "typify-impl", ] -[[package]] -name = "ulid" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "470dbf6591da1b39d43c14523b2b469c86879a53e8b758c8e090a470fe7b1fbe" -dependencies = [ - "rand 0.9.2", - "web-time", -] - [[package]] name = "unicode-bidi" version = "0.3.18" @@ -6769,30 +6488,30 @@ checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5" [[package]] name = "unicode-ident" -version = "1.0.19" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" [[package]] name = "unicode-normalization" -version = "0.1.24" +version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" dependencies = [ "tinyvec", ] [[package]] name = "unicode-properties" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e70f2a8b45122e719eb623c01822704c4e0907e7e426a05927e1a1cfff5b75d0" +checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d" [[package]] name = "unicode-segmentation" -version = "1.12.0" +version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" +checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c" [[package]] name = "unicode-width" @@ -6802,21 +6521,21 @@ checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" [[package]] name = "unicode-width" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" [[package]] -name = "unindent" -version = "0.2.4" +name = "unicode-xid" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" [[package]] name = "unit-prefix" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "323402cff2dd658f39ca17c789b502021b3f18707c91cdf22e3838e1b4023817" +checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3" [[package]] name = "unsafe-libyaml" @@ -6832,43 +6551,42 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "ureq" -version = "3.1.2" +version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99ba1025f18a4a3fc3e9b48c868e9beb4f24f4b4b1a325bada26bd4119f46537" +checksum = "dea7109cdcd5864d4eeb1b58a1648dc9bf520360d7af16ec26d0a9354bafcfc0" dependencies = [ "base64 0.22.1", "log", "percent-encoding", "rustls", - "rustls-pemfile", "rustls-pki-types", "ureq-proto", - "utf-8", - "webpki-roots", + "utf8-zero", ] [[package]] name = "ureq-proto" -version = 
"0.5.2" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60b4531c118335662134346048ddb0e54cc86bd7e81866757873055f0e38f5d2" +checksum = "e994ba84b0bd1b1b0cf92878b7ef898a5c1760108fe7b6010327e274917a808c" dependencies = [ "base64 0.22.1", - "http 1.3.1", + "http 1.4.0", "httparse", "log", ] [[package]] name = "url" -version = "2.5.7" +version = "2.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08bc136a29a3d1758e07a9cca267be308aeebf5cfd5a10f3f67ab2097683ef5b" +checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" dependencies = [ "form_urlencoded", "idna", "percent-encoding", "serde", + "serde_derive", ] [[package]] @@ -6878,10 +6596,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" [[package]] -name = "utf-8" -version = "0.7.6" +name = "utf8-zero" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" +checksum = "b8c0a043c9540bae7c578c88f91dda8bd82e59ae27c21baca69c8b191aaf5a6e" [[package]] name = "utf8_iter" @@ -6897,13 +6615,12 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.18.1" +version = "1.23.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2" +checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76" dependencies = [ - "getrandom 0.3.4", + "getrandom 0.4.2", "js-sys", - "serde", "wasm-bindgen", ] @@ -6950,26 +6667,47 @@ version = "0.11.1+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" +[[package]] +name = "wasi" +version = "0.14.7+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "883478de20367e224c0090af9cf5f9fa85bed63a95c1abf3afc5c083ebc06e8c" +dependencies = [ + "wasip2", +] + [[package]] name = "wasip2" -version = "1.0.1+wasi-0.2.4" +version = "1.0.3+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" +dependencies = [ + "wit-bindgen 0.57.1", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" dependencies = [ - "wit-bindgen", + "wit-bindgen 0.51.0", ] [[package]] name = "wasite" -version = "0.1.0" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" +checksum = "66fe902b4a6b8028a753d5424909b764ccf79b7a209eac9bf97e59cda9f71a42" +dependencies = [ + "wasi 0.14.7+wasi-0.2.4", +] [[package]] name = "wasm-bindgen" -version = "0.2.105" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da95793dfc411fbbd93f5be7715b0578ec61fe87cb1a42b12eb625caa5c5ea60" +checksum = "0bf938a0bacb0469e83c1e148908bd7d5a6010354cf4fb73279b7447422e3a89" dependencies = [ "cfg-if", "once_cell", @@ -6980,22 +6718,19 @@ dependencies = [ [[package]] name = 
"wasm-bindgen-futures" -version = "0.4.55" +version = "0.4.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "551f88106c6d5e7ccc7cd9a16f312dd3b5d36ea8b4954304657d5dfba115d4a0" +checksum = "f371d383f2fb139252e0bfac3b81b265689bf45b6874af544ffa4c975ac1ebf8" dependencies = [ - "cfg-if", "js-sys", - "once_cell", "wasm-bindgen", - "web-sys", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.105" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04264334509e04a7bf8690f2384ef5265f05143a4bff3889ab7a3269adab59c2" +checksum = "eeff24f84126c0ec2db7a449f0c2ec963c6a49efe0698c4242929da037ca28ed" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -7003,48 +6738,85 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.105" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "420bc339d9f322e562942d52e115d57e950d12d88983a14c79b86859ee6c7ebc" +checksum = "9d08065faf983b2b80a79fd87d8254c409281cf7de75fc4b773019824196c904" dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.105" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76f218a38c84bcb33c25ec7059b07847d465ce0e0a76b995e134a45adcb6af76" +checksum = "5fd04d9e306f1907bd13c6361b5c6bfc7b3b3c095ed3f8a9246390f8dbdee129" dependencies = [ "unicode-ident", ] [[package]] name = "wasm-bindgen-test" -version = "0.3.55" +version = "0.3.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfc379bfb624eb59050b509c13e77b4eb53150c350db69628141abce842f2373" +checksum = "6bb55e2540ad1c56eec35fd63e2aea15f83b11ce487fd2de9ad11578dfc047ea" dependencies = [ + "async-trait", + "cast", "js-sys", + "libm", "minicov", + "nu-ansi-term", + "num-traits", + "oorandom", + "serde", + "serde_json", "wasm-bindgen", "wasm-bindgen-futures", "wasm-bindgen-test-macro", + "wasm-bindgen-test-shared", ] [[package]] name = "wasm-bindgen-test-macro" -version = "0.3.55" +version = "0.3.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "085b2df989e1e6f9620c1311df6c996e83fe16f57792b272ce1e024ac16a90f1" +checksum = "caf0ca1bd612b988616bac1ab34c4e4290ef18f7148a1d8b7f31c150080e9295" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", +] + +[[package]] +name = "wasm-bindgen-test-shared" +version = "0.2.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23cda5ecc67248c48d3e705d3e03e00af905769b78b9d2a1678b663b8b9d4472" + +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap 2.14.0", + "wasm-encoder", + "wasmparser", ] [[package]] @@ -7060,11 +6832,23 @@ dependencies = [ "web-sys", ] +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 
0.15.5", + "indexmap 2.14.0", + "semver", +] + [[package]] name = "web-sys" -version = "0.3.82" +version = "0.3.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a1f95c0d03a47f4ae1f7a64643a6bb97465d9b740f0fa8f90ea33915c99a9a1" +checksum = "4f2dfbb17949fa2088e5d39408c48368947b86f7834484e87b73de55bc14d97d" dependencies = [ "js-sys", "wasm-bindgen", @@ -7080,22 +6864,15 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "webpki-roots" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32b130c0d2d49f8b6889abc456e795e82525204f27c42cf767cf0d7734e089b8" -dependencies = [ - "rustls-pki-types", -] - [[package]] name = "whoami" -version = "1.6.1" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d4a4db5077702ca3015d3d02d74974948aba2ad9e12ab7df718ee64ccd7e97d" +checksum = "d6a5b12f9df4f978d2cfdb1bd3bac52433f44393342d7ee9c25f5a1c14c0f45d" dependencies = [ + "libc", "libredox", + "objc2-system-configuration", "wasite", "web-sys", ] @@ -7122,7 +6899,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.61.0", + "windows-sys 0.52.0", ] [[package]] @@ -7133,141 +6910,103 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows" -version = "0.61.3" +version = "0.62.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9babd3a767a4c1aef6900409f85f5d53ce2544ccdfaa86dad48c91782c6d6893" +checksum = "527fadee13e0c05939a6a05d5bd6eec6cd2e3dbd648b9f8e447c6518133d8580" dependencies = [ "windows-collections", - "windows-core 0.61.2", + "windows-core", "windows-future", - "windows-link 0.1.3", "windows-numerics", ] [[package]] name = "windows-collections" -version = "0.2.0" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3beeceb5e5cfd9eb1d76b381630e82c4241ccd0d27f1a39ed41b2760b255c5e8" +checksum = "23b2d95af1a8a14a3c7367e1ed4fc9c20e0a26e79551b1454d72583c97cc6610" dependencies = [ - "windows-core 0.61.2", -] - -[[package]] -name = "windows-core" -version = "0.61.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0fdd3ddb90610c7638aa2b3a3ab2904fb9e5cdbecc643ddb3647212781c4ae3" -dependencies = [ - "windows-implement", - "windows-interface", - "windows-link 0.1.3", - "windows-result 0.3.4", - "windows-strings 0.4.2", + "windows-core", ] [[package]] name = "windows-core" -version = "0.62.0" +version = "0.62.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57fe7168f7de578d2d8a05b07fd61870d2e73b4020e9f49aa00da8471723497c" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" dependencies = [ "windows-implement", "windows-interface", - "windows-link 0.2.0", - "windows-result 0.4.0", - "windows-strings 0.5.0", + "windows-link", + "windows-result", + "windows-strings", ] [[package]] name = "windows-future" -version = "0.2.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc6a41e98427b19fe4b73c550f060b59fa592d7d686537eebf9385621bfbad8e" +checksum = "e1d6f90251fe18a279739e78025bd6ddc52a7e22f921070ccdc67dde84c605cb" dependencies = [ - "windows-core 0.61.2", - "windows-link 0.1.3", + "windows-core", + "windows-link", "windows-threading", ] [[package]] name = "windows-implement" -version = 
"0.60.0" +version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] name = "windows-interface" -version = "0.59.1" +version = "0.59.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] name = "windows-link" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" - -[[package]] -name = "windows-link" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45e46c0661abb7180e7b9c281db115305d49ca1709ab8242adf09666d2173c65" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" [[package]] name = "windows-numerics" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9150af68066c4c5c07ddc0ce30421554771e528bde427614c61038bc2c92c2b1" -dependencies = [ - "windows-core 0.61.2", - "windows-link 0.1.3", -] - -[[package]] -name = "windows-result" -version = "0.3.4" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6" +checksum = "6e2e40844ac143cdb44aead537bbf727de9b044e107a0f1220392177d15b0f26" dependencies = [ - "windows-link 0.1.3", + "windows-core", + "windows-link", ] [[package]] name = "windows-result" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7084dcc306f89883455a206237404d3eaf961e5bd7e0f312f7c91f57eb44167f" -dependencies = [ - "windows-link 0.2.0", -] - -[[package]] -name = "windows-strings" -version = "0.4.2" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" dependencies = [ - "windows-link 0.1.3", + "windows-link", ] [[package]] name = "windows-strings" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7218c655a553b0bed4426cf54b20d7ba363ef543b52d515b3e48d7fd55318dda" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" dependencies = [ - "windows-link 0.2.0", + "windows-link", ] [[package]] @@ -7279,31 +7018,22 @@ dependencies = [ "windows-targets 0.52.6", ] -[[package]] -name = "windows-sys" -version = "0.59.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" -dependencies = [ - "windows-targets 0.52.6", -] - [[package]] name = "windows-sys" version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" dependencies = [ - "windows-targets 0.53.3", + "windows-targets 0.53.5", ] [[package]] name = "windows-sys" -version = "0.61.0" +version = "0.61.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "e201184e40b2ede64bc2ea34968b28e33622acdbbf37104f0e4a33f7abe657aa" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" dependencies = [ - "windows-link 0.2.0", + "windows-link", ] [[package]] @@ -7324,28 +7054,28 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.53.3" +version = "0.53.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" dependencies = [ - "windows-link 0.1.3", - "windows_aarch64_gnullvm 0.53.0", - "windows_aarch64_msvc 0.53.0", - "windows_i686_gnu 0.53.0", - "windows_i686_gnullvm 0.53.0", - "windows_i686_msvc 0.53.0", - "windows_x86_64_gnu 0.53.0", - "windows_x86_64_gnullvm 0.53.0", - "windows_x86_64_msvc 0.53.0", + "windows-link", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", ] [[package]] name = "windows-threading" -version = "0.1.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b66463ad2e0ea3bbf808b7f1d371311c80e115c0b71d60efc142cafbcfb057a6" +checksum = "3949bd5b99cafdf1c7ca86b43ca564028dfe27d66958f2470940f73d86d75b37" dependencies = [ - "windows-link 0.1.3", + "windows-link", ] [[package]] @@ -7356,9 +7086,9 @@ checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_gnullvm" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" [[package]] name = "windows_aarch64_msvc" @@ -7368,9 +7098,9 @@ checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_aarch64_msvc" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" [[package]] name = "windows_i686_gnu" @@ -7380,9 +7110,9 @@ checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" [[package]] name = "windows_i686_gnu" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" [[package]] name = "windows_i686_gnullvm" @@ -7392,9 +7122,9 @@ checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_gnullvm" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" [[package]] name = "windows_i686_msvc" @@ -7404,9 +7134,9 @@ checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_i686_msvc" -version = "0.53.0" +version = "0.53.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" [[package]] name = "windows_x86_64_gnu" @@ -7416,9 +7146,9 @@ checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnu" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" [[package]] name = "windows_x86_64_gnullvm" @@ -7428,9 +7158,9 @@ checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_gnullvm" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" [[package]] name = "windows_x86_64_msvc" @@ -7440,40 +7170,119 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "windows_x86_64_msvc" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" [[package]] name = "winnow" -version = "0.7.13" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21a0236b59786fed61e2a80582dd500fe61f18b5dca67a4a067d0bc9039339cf" +checksum = "2ee1708bef14716a11bae175f579062d4554d95be2c6829f518df847b7b3fdd0" dependencies = [ "memchr", ] [[package]] name = "wit-bindgen" -version = "0.46.0" +version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] [[package]] -name = "writeable" -version = "0.6.1" +name = "wit-bindgen" +version = "0.57.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" +checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" [[package]] -name = "wyz" -version = "0.5.1" +name = "wit-bindgen-core" +version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" dependencies = [ - "tap", + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap 2.14.0", + "prettyplease", + "syn 2.0.117", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + 
"proc-macro2", + "quote", + "syn 2.0.117", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap 2.14.0", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap 2.14.0", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", ] +[[package]] +name = "writeable" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" + [[package]] name = "xattr" version = "1.6.1" @@ -7490,15 +7299,6 @@ version = "0.13.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" -[[package]] -name = "xz2" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" -dependencies = [ - "lzma-sys", -] - [[package]] name = "yansi" version = "1.0.1" @@ -7507,11 +7307,10 @@ checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" [[package]] name = "yoke" -version = "0.8.0" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" +checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" dependencies = [ - "serde", "stable_deref_trait", "yoke-derive", "zerofrom", @@ -7519,68 +7318,68 @@ dependencies = [ [[package]] name = "yoke-derive" -version = "0.8.0" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" +checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", "synstructure", ] [[package]] name = "zerocopy" -version = "0.8.27" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.27" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" +checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] name = "zerofrom" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +checksum = "69faa1f2a1ea75661980b013019ed6687ed0e83d069bc1114e2cc74c6c04c4df" dependencies = [ "zerofrom-derive", ] [[package]] name = "zerofrom-derive" -version = "0.1.6" +version = 
"0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", "synstructure", ] [[package]] name = "zeroize" -version = "1.8.1" +version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" [[package]] name = "zerotrie" -version = "0.2.2" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" +checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf" dependencies = [ "displaydoc", "yoke", @@ -7589,9 +7388,9 @@ dependencies = [ [[package]] name = "zerovec" -version = "0.11.4" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7aa2bd55086f1ab526693ecbe444205da57e25f4489879da80635a46d90e73b" +checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239" dependencies = [ "yoke", "zerofrom", @@ -7600,20 +7399,26 @@ dependencies = [ [[package]] name = "zerovec-derive" -version = "0.11.1" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" +checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" dependencies = [ "proc-macro2", "quote", - "syn 2.0.108", + "syn 2.0.117", ] [[package]] name = "zlib-rs" -version = "0.5.2" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" + +[[package]] +name = "zmij" +version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f06ae92f42f5e5c42443fd094f245eb656abf56dd7cce9b8b263236565e00f2" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" [[package]] name = "zstd" diff --git a/Cargo.toml b/Cargo.toml index f15929b4c2b00..37734211266ba 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -71,7 +71,7 @@ resolver = "2" [workspace.package] authors = ["Apache DataFusion "] -edition = "2021" +edition = "2024" homepage = "https://datafusion.apache.org" license = "Apache-2.0" readme = "README.md" @@ -79,7 +79,7 @@ repository = "https://github.com/apache/datafusion" # Define Minimum Supported Rust Version (MSRV) rust-version = "1.88.0" # Define DataFusion version -version = "50.3.0" +version = "53.1.0" [workspace.dependencies] # We turn off default-features for some dependencies here so the workspaces which inherit them can @@ -87,88 +87,103 @@ version = "50.3.0" # for the inherited dependency but cannot do the reverse (override from true to false). 
# # See for more details: https://github.com/rust-lang/cargo/issues/11329 -ahash = { version = "0.8", default-features = false, features = [ - "runtime-rng", -] } -apache-avro = { version = "0.20", default-features = false } -arrow = { version = "57.0.0", features = [ +apache-avro = { version = "0.21", default-features = false } +arrow = { version = "58.1.0", features = [ "prettyprint", "chrono-tz", ] } -arrow-buffer = { version = "57.0.0", default-features = false } -arrow-flight = { version = "57.0.0", features = [ +arrow-avro = { version = "58.1.0", default-features = false, features = [ + "deflate", + "snappy", + "zstd", + "bzip2", + "xz", +] } +arrow-buffer = { version = "58.1.0", default-features = false } +arrow-data = { version = "58.1.0", default-features = false } +arrow-flight = { version = "58.1.0", features = [ "flight-sql-experimental", ] } -arrow-ipc = { version = "57.0.0", default-features = false, features = [ +# Both codecs are required here to make sure that code paths like +# file-spilling have access to all compression codecs. +arrow-ipc = { version = "58.1.0", default-features = false, features = [ "lz4", + "zstd", ] } -arrow-ord = { version = "57.0.0", default-features = false } -arrow-schema = { version = "57.0.0", default-features = false } +arrow-ord = { version = "58.1.0", default-features = false } +arrow-schema = { version = "58.1.0", default-features = false } async-trait = "0.1.89" bigdecimal = "0.4.8" -bytes = "1.10" -chrono = { version = "0.4.42", default-features = false } -criterion = "0.7" -ctor = "0.6.1" +bytes = "1.11" +bzip2 = "0.6.1" +chrono = { version = "0.4.44", default-features = false } +criterion = "0.8" +ctor = "0.10.0" dashmap = "6.0.1" -datafusion = { path = "datafusion/core", version = "50.3.0", default-features = false } -datafusion-catalog = { path = "datafusion/catalog", version = "50.3.0" } -datafusion-catalog-listing = { path = "datafusion/catalog-listing", version = "50.3.0" } -datafusion-common = { path = "datafusion/common", version = "50.3.0", default-features = false } -datafusion-common-runtime = { path = "datafusion/common-runtime", version = "50.3.0" } -datafusion-datasource = { path = "datafusion/datasource", version = "50.3.0", default-features = false } -datafusion-datasource-arrow = { path = "datafusion/datasource-arrow", version = "50.3.0", default-features = false } -datafusion-datasource-avro = { path = "datafusion/datasource-avro", version = "50.3.0", default-features = false } -datafusion-datasource-csv = { path = "datafusion/datasource-csv", version = "50.3.0", default-features = false } -datafusion-datasource-json = { path = "datafusion/datasource-json", version = "50.3.0", default-features = false } -datafusion-datasource-parquet = { path = "datafusion/datasource-parquet", version = "50.3.0", default-features = false } -datafusion-doc = { path = "datafusion/doc", version = "50.3.0" } -datafusion-execution = { path = "datafusion/execution", version = "50.3.0", default-features = false } -datafusion-expr = { path = "datafusion/expr", version = "50.3.0", default-features = false } -datafusion-expr-common = { path = "datafusion/expr-common", version = "50.3.0" } -datafusion-ffi = { path = "datafusion/ffi", version = "50.3.0" } -datafusion-functions = { path = "datafusion/functions", version = "50.3.0" } -datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "50.3.0" } -datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "50.3.0" } 
-datafusion-functions-nested = { path = "datafusion/functions-nested", version = "50.3.0", default-features = false } -datafusion-functions-table = { path = "datafusion/functions-table", version = "50.3.0" } -datafusion-functions-window = { path = "datafusion/functions-window", version = "50.3.0" } -datafusion-functions-window-common = { path = "datafusion/functions-window-common", version = "50.3.0" } -datafusion-macros = { path = "datafusion/macros", version = "50.3.0" } -datafusion-optimizer = { path = "datafusion/optimizer", version = "50.3.0", default-features = false } -datafusion-physical-expr = { path = "datafusion/physical-expr", version = "50.3.0", default-features = false } -datafusion-physical-expr-adapter = { path = "datafusion/physical-expr-adapter", version = "50.3.0", default-features = false } -datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "50.3.0", default-features = false } -datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "50.3.0" } -datafusion-physical-plan = { path = "datafusion/physical-plan", version = "50.3.0" } -datafusion-proto = { path = "datafusion/proto", version = "50.3.0" } -datafusion-proto-common = { path = "datafusion/proto-common", version = "50.3.0" } -datafusion-pruning = { path = "datafusion/pruning", version = "50.3.0" } -datafusion-session = { path = "datafusion/session", version = "50.3.0" } -datafusion-spark = { path = "datafusion/spark", version = "50.3.0" } -datafusion-sql = { path = "datafusion/sql", version = "50.3.0" } -datafusion-substrait = { path = "datafusion/substrait", version = "50.3.0" } +datafusion = { path = "datafusion/core", version = "53.1.0", default-features = false } +datafusion-catalog = { path = "datafusion/catalog", version = "53.1.0" } +datafusion-catalog-listing = { path = "datafusion/catalog-listing", version = "53.1.0" } +datafusion-common = { path = "datafusion/common", version = "53.1.0", default-features = false } +datafusion-common-runtime = { path = "datafusion/common-runtime", version = "53.1.0" } +datafusion-datasource = { path = "datafusion/datasource", version = "53.1.0", default-features = false } +datafusion-datasource-arrow = { path = "datafusion/datasource-arrow", version = "53.1.0", default-features = false } +datafusion-datasource-avro = { path = "datafusion/datasource-avro", version = "53.1.0", default-features = false } +datafusion-datasource-csv = { path = "datafusion/datasource-csv", version = "53.1.0", default-features = false } +datafusion-datasource-json = { path = "datafusion/datasource-json", version = "53.1.0", default-features = false } +datafusion-datasource-parquet = { path = "datafusion/datasource-parquet", version = "53.1.0", default-features = false } +datafusion-doc = { path = "datafusion/doc", version = "53.1.0" } +datafusion-execution = { path = "datafusion/execution", version = "53.1.0", default-features = false } +datafusion-expr = { path = "datafusion/expr", version = "53.1.0", default-features = false } +datafusion-expr-common = { path = "datafusion/expr-common", version = "53.1.0" } +datafusion-ffi = { path = "datafusion/ffi", version = "53.1.0" } +datafusion-functions = { path = "datafusion/functions", version = "53.1.0" } +datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "53.1.0" } +datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "53.1.0" } +datafusion-functions-nested = { path = "datafusion/functions-nested", 
version = "53.1.0", default-features = false } +datafusion-functions-table = { path = "datafusion/functions-table", version = "53.1.0" } +datafusion-functions-window = { path = "datafusion/functions-window", version = "53.1.0" } +datafusion-functions-window-common = { path = "datafusion/functions-window-common", version = "53.1.0" } +datafusion-macros = { path = "datafusion/macros", version = "53.1.0" } +datafusion-optimizer = { path = "datafusion/optimizer", version = "53.1.0", default-features = false } +datafusion-physical-expr = { path = "datafusion/physical-expr", version = "53.1.0", default-features = false } +datafusion-physical-expr-adapter = { path = "datafusion/physical-expr-adapter", version = "53.1.0", default-features = false } +datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "53.1.0", default-features = false } +datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "53.1.0" } +datafusion-physical-plan = { path = "datafusion/physical-plan", version = "53.1.0" } +datafusion-proto = { path = "datafusion/proto", version = "53.1.0" } +datafusion-proto-common = { path = "datafusion/proto-common", version = "53.1.0" } +datafusion-pruning = { path = "datafusion/pruning", version = "53.1.0" } +datafusion-session = { path = "datafusion/session", version = "53.1.0" } +datafusion-spark = { path = "datafusion/spark", version = "53.1.0" } +datafusion-sql = { path = "datafusion/sql", version = "53.1.0" } +datafusion-substrait = { path = "datafusion/substrait", version = "53.1.0" } doc-comment = "0.3" env_logger = "0.11" +flate2 = "1.1.9" futures = "0.3" +glob = "0.3.0" half = { version = "2.7.0", default-features = false } -hashbrown = { version = "0.14.5", features = ["raw"] } +hashbrown = { version = "0.17.0" } hex = { version = "0.4.3" } -indexmap = "2.12.0" -insta = { version = "1.43.2", features = ["glob", "filters"] } +indexmap = "2.14.0" +insta = { version = "1.47.2", features = ["glob", "filters"] } itertools = "0.14" +itoa = "1.0" +liblzma = { version = "0.4.6", features = ["static"] } log = "^0.4" +memchr = "2.8.0" num-traits = { version = "0.2" } -object_store = { version = "0.12.4", default-features = false } +object_store = { version = "0.13.2", default-features = false } parking_lot = "0.12" -parquet = { version = "57.0.0", default-features = false, features = [ +parquet = { version = "58.1.0", default-features = false, features = [ "arrow", "async", "object_store", ] } -pbjson = { version = "0.8.0" } -pbjson-types = "0.8" +pbjson = { version = "0.9.0" } +pbjson-types = "0.9" +pin-project = "1" # Should match arrow-flight's version of prost. 
prost = "0.14.1" rand = "0.9" @@ -176,12 +191,18 @@ recursive = "0.1.1" regex = "1.12" rstest = "0.26.1" serde_json = "1" -sqlparser = { version = "0.59.0", default-features = false, features = ["std", "visitor"] } +sha2 = "^0.11.0" +sqlparser = { version = "0.61.0", default-features = false, features = ["std", "visitor"] } +strum = "0.28.0" +strum_macros = "0.28.0" tempfile = "3" -testcontainers = { version = "0.25.2", features = ["default"] } -testcontainers-modules = { version = "0.13" } -tokio = { version = "1.48", features = ["macros", "rt", "sync"] } +testcontainers-modules = { version = "0.15" } +tokio = { version = "1.52", features = ["macros", "rt", "sync"] } +tokio-stream = "0.1" +tokio-util = "0.7" url = "2.5.7" +uuid = "1.23" +zstd = { version = "0.13", default-features = false } [workspace.lints.clippy] # Detects large stack-allocated futures that may cause stack overflow crashes (see threshold in clippy.toml) @@ -191,6 +212,11 @@ or_fun_call = "warn" unnecessary_lazy_evaluations = "warn" uninlined_format_args = "warn" inefficient_to_string = "warn" +# https://github.com/apache/datafusion/issues/18503 +needless_pass_by_value = "warn" +# https://github.com/apache/datafusion/issues/18881 +allow_attributes = "warn" +assigning_clones = "warn" [workspace.lints.rust] unexpected_cfgs = { level = "warn", check-cfg = [ @@ -203,50 +229,56 @@ unused_qualifications = "deny" # -------------------- # Compilation Profiles # -------------------- -# A Cargo profile is a preset for the compiler/linker knobs that trade off: +# A Cargo profile is a preset for the compiler/linker knobs that trades off: # - Build time: how quickly code compiles and links # - Runtime performance: how fast the resulting binaries execute # - Binary size: how large the executables end up # - Debuggability: how much debug information is preserved for debugging and profiling # +# To use a profile: `cargo [ build | run | ... ] --profile ` +# # Profiles available: -# - dev: default debug build; fastest to compile, slowest to run, full debug info -# for everyday development. -# Run: cargo run -# - release: optimized build; slowest to compile, fastest to run, smallest -# binaries for public releases. -# Run: cargo run --release -# - release-nonlto: skips LTO, so it builds quicker while staying close to -# release performance. It is useful when developing performance optimizations. -# Run: cargo run --profile release-nonlto +# - dev: default debug build; fastest to compile, slowest to run, full debug info. +# For everyday development; default for "cargo [ build | test | run ]". +# - release: fully optimized build; slowest to compile, fastest to run, smallest +# binaries. For public releases; default for "cargo [ bench | install ]". +# - release-nonlto: skips LTO, so it builds much faster while staying close to +# release performance. Useful when developing performance optimizations. # - profiling: inherits release optimizations but retains debug info to support # profiling tools and flamegraphs. -# Run: cargo run --profile profiling -# - ci: derived from `dev` but disables incremental builds and strips dependency -# symbols to keep CI artifacts small and reproducible. -# Run: cargo run --profile ci +# - ci: derived from `dev` but disables debug info and incremental builds to keep +# CI artifacts small and reproducible. +# - ci-optimized: derived from `release` but enables debug assertions and uses +# less aggressive optimizations for faster builds. Used for long-running CI +# tasks. 
# # If you want to optimize compilation, the `compile_profile` benchmark can be useful. # See `benchmarks/README.md` for more details. [profile.release] codegen-units = 1 lto = true -strip = true # Eliminate debug information to minimize binary size +strip = true # Eliminate debug info to minimize binary size [profile.release-nonlto] -codegen-units = 16 -debug-assertions = false -incremental = false inherits = "release" +codegen-units = 16 lto = false -opt-level = 3 -overflow-checks = false -rpath = false -strip = false # Retain debug info for flamegraphs +strip = false # Retain debug info for flamegraphs + +[profile.profiling] +inherits = "release" +debug = true +strip = false + +[profile.ci-optimized] +inherits = "release" +debug-assertions = true +codegen-units = 16 +lto = "thin" [profile.ci] -debug = false inherits = "dev" +debug = false incremental = false # This rule applies to every package except workspace members (dependencies @@ -257,8 +289,3 @@ debug = false debug-assertions = false strip = "debuginfo" incremental = false - -[profile.profiling] -inherits = "release" -debug = true -strip = false diff --git a/NOTICE.txt b/NOTICE.txt index 7f3c80d606c07..0bd2d52368fea 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -1,5 +1,5 @@ Apache DataFusion -Copyright 2019-2025 The Apache Software Foundation +Copyright 2019-2026 The Apache Software Foundation This product includes software developed at The Apache Software Foundation (http://www.apache.org/). diff --git a/README.md b/README.md index 5191496eaafe3..630d4295bd427 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ [![Build Status][actions-badge]][actions-url] ![Commit Activity][commit-activity-badge] [![Open Issues][open-issues-badge]][open-issues-url] +[![Pending PRs][pending-pr-badge]][pending-pr-url] [![Discord chat][discord-badge]][discord-url] [![Linkedin][linkedin-badge]][linkedin-url] ![Crates.io MSRV][msrv-badge] @@ -39,6 +40,8 @@ [commit-activity-badge]: https://img.shields.io/github/commit-activity/m/apache/datafusion [open-issues-badge]: https://img.shields.io/github/issues-raw/apache/datafusion [open-issues-url]: https://github.com/apache/datafusion/issues +[pending-pr-badge]: https://img.shields.io/github/issues-search/apache/datafusion?query=is%3Apr+is%3Aopen+draft%3Afalse+review%3Arequired+status%3Asuccess&label=Pending%20PRs&logo=github +[pending-pr-url]: https://github.com/apache/datafusion/pulls?q=is%3Apr+is%3Aopen+draft%3Afalse+review%3Arequired+status%3Asuccess+sort%3Aupdated-desc [linkedin-badge]: https://img.shields.io/badge/Follow-Linkedin-blue [linkedin-url]: https://www.linkedin.com/company/apache-datafusion/ [msrv-badge]: https://img.shields.io/crates/msrv/datafusion?label=Min%20Rust%20Version @@ -55,7 +58,7 @@ DataFusion is an extensible query engine written in [Rust] that uses [Apache Arrow] as its in-memory format. This crate provides libraries and binaries for developers building fast and -feature rich database and analytic systems, customized to particular workloads. +feature-rich database and analytic systems, customized for particular workloads. See [use cases] for examples. The following related subprojects target end users: - [DataFusion Python](https://github.com/apache/datafusion-python/) offers a Python interface for SQL and DataFrame @@ -64,7 +67,7 @@ See [use cases] for examples. The following related subprojects target end users DataFusion. 
"Out of the box," -DataFusion offers [SQL](https://datafusion.apache.org/user-guide/sql/index.html) and [Dataframe](https://datafusion.apache.org/user-guide/dataframe.html) APIs, excellent [performance], +DataFusion offers [SQL](https://datafusion.apache.org/user-guide/sql/index.html) and [DataFrame](https://datafusion.apache.org/user-guide/dataframe.html) APIs, excellent [performance], built-in support for CSV, Parquet, JSON, and Avro, extensive customization, and a great community. @@ -81,7 +84,7 @@ See the [Architecture] section for more details. [performance]: https://benchmark.clickhouse.com/ [architecture]: https://datafusion.apache.org/contributor-guide/architecture.html -Here are links to some important information +Here are links to important resources: - [Project Site](https://datafusion.apache.org/) - [Installation](https://datafusion.apache.org/user-guide/cli/installation.html) @@ -94,8 +97,8 @@ Here are links to some important information ## What can you do with this crate? -DataFusion is great for building projects such as domain specific query engines, new database platforms and data pipelines, query languages and more. -It lets you start quickly from a fully working engine, and then customize those features specific to your use. [Click Here](https://datafusion.apache.org/user-guide/introduction.html#known-users) to see a list known users. +DataFusion is great for building projects such as domain-specific query engines, new database platforms and data pipelines, query languages and more. +It lets you start quickly from a fully working engine, and then customize those features specific to your needs. See the [list of known users](https://datafusion.apache.org/user-guide/introduction.html#known-users). ## Contributing to DataFusion @@ -112,15 +115,15 @@ This crate has several [features] which can be specified in your `Cargo.toml`. Default features: -- `nested_expressions`: functions for working with nested type function such as `array_to_string` +- `nested_expressions`: functions for working with nested types such as `array_to_string` - `compression`: reading files compressed with `xz2`, `bzip2`, `flate2`, and `zstd` - `crypto_expressions`: cryptographic functions such as `md5` and `sha256` - `datetime_expressions`: date and time functions such as `to_timestamp` - `encoding_expressions`: `encode` and `decode` functions - `parquet`: support for reading the [Apache Parquet] format -- `sql`: Support for sql parsing / planning +- `sql`: support for SQL parsing and planning - `regex_expressions`: regular expression functions, such as `regexp_match` -- `unicode_expressions`: Include unicode aware functions such as `character_length` +- `unicode_expressions`: include Unicode-aware functions such as `character_length` - `unparser`: enables support to reverse LogicalPlans back into SQL - `recursive_protection`: uses [recursive](https://docs.rs/recursive/latest/recursive/) for stack overflow protection. 
@@ -129,7 +132,6 @@ Optional features: - `avro`: support for reading the [Apache Avro] format - `backtrace`: include backtrace information in error messages - `parquet_encryption`: support for using [Parquet Modular Encryption] -- `pyarrow`: conversions between PyArrow and DataFusion types - `serde`: enable arrow-schema's `serde` feature [apache avro]: https://avro.apache.org/ diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore index c35b1a7c1944f..1e59d094eb063 100644 --- a/benchmarks/.gitignore +++ b/benchmarks/.gitignore @@ -1,3 +1,5 @@ data +data_csv results venv +!sql_benchmarks/**/results/ diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index b3fd520814dbc..1815f8bc42ca3 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -25,7 +25,11 @@ homepage = { workspace = true } repository = { workspace = true } license = { workspace = true } rust-version = { workspace = true } +publish = false +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true @@ -37,6 +41,10 @@ mimalloc_extended = ["libmimalloc-sys/extended"] [dependencies] arrow = { workspace = true } +async-trait = "0.1" +bytes = { workspace = true } +clap = { version = "4.6.0", features = ["derive", "env"] } +criterion = { workspace = true, features = ["html_reports"] } datafusion = { workspace = true, default-features = true } datafusion-common = { workspace = true, default-features = true } env_logger = { workspace = true } @@ -50,10 +58,14 @@ rand = { workspace = true } regex.workspace = true serde = { version = "1.0.228", features = ["derive"] } serde_json = { workspace = true } -snmalloc-rs = { version = "0.3", optional = true } -structopt = { version = "0.3", default-features = false } +snmalloc-rs = { version = "0.7", optional = true } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } -tokio-util = { version = "0.7.16" } +tokio-util = { version = "0.7.17" } [dev-dependencies] datafusion-proto = { workspace = true } +tempfile = { workspace = true } + +[[bench]] +harness = false +name = "sql" diff --git a/benchmarks/README.md b/benchmarks/README.md index 8fed85fa02b80..a4ddb09e0771c 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -95,7 +95,7 @@ Generate the data required for the compile profile helper (TPC-H SF=1): ./bench.sh data compile_profile ``` -Run the benchmark across all default Cargo profiles (`dev`, `release`, `ci`, `release-nonlto`): +Run the benchmark across all default Cargo profiles (`dev`, `release`, `ci`, `ci-optimized`, `release-nonlto`, `profiling`): ```shell ./bench.sh run compile_profile @@ -119,7 +119,6 @@ You can also invoke the helper directly if you need to customise arguments furth ./benchmarks/compile_profile.py --profiles dev release --data /path/to/tpch_sf1 ``` - ## Benchmark with modified configurations ### Select join algorithm @@ -147,6 +146,19 @@ To verify that datafusion picked up your configuration, run the benchmarks with ## Comparing performance of main and a branch +For TPC-H +```shell +./benchmarks/compare_tpch.sh main mybranch +``` + +For TPC-DS. 
+To get data in `DATA_DIR` for TPCDS, first run `./benchmarks/bench.sh data tpcds`
+```shell
+DATA_DIR=../../datafusion-benchmarks/tpcds/data/sf1/ ./benchmarks/compare_tpcds.sh main mybranch
+```
+
+Alternatively, you can compare manually by following the example below
+
```shell
git checkout main
@@ -228,6 +240,23 @@ Benchmark tpch_mem.json
└──────────────┴──────────────┴──────────────┴───────────────┘
```

+## Comparing performance of main and a PR
+
+### TPCDS
+
+Assuming you already have the TPCDS data locally:
+
+```shell
+export DATA_DIR=../../datafusion-benchmarks/tpcds/data/sf1/
+export PR_NUMBER=19464
+git fetch upstream pull/$PR_NUMBER/head:pr-$PR_NUMBER
+git checkout main
+git pull
+./benchmarks/compare_tpcds.sh main pr-$PR_NUMBER
+```
+
+Note: if `gh` is installed, you can also run `gh pr checkout $PR_NUMBER` instead of `git fetch upstream pull/$PR_NUMBER/head:pr-$PR_NUMBER`.
+
### Running Benchmarks Manually

Assuming data is in the `data` directory, the `tpch` benchmark can be run with a command like this:
@@ -243,28 +272,11 @@ See the help for more details.

You can enable `mimalloc` or `snmalloc` (to use either the mimalloc or snmalloc allocator) as features by passing them in as `--features`. For example:

```shell
-cargo run --release --features "mimalloc" --bin tpch -- benchmark datafusion --iterations 3 --path ./data --format tbl --query 1 --batch-size 4096
-```
-
-The benchmark program also supports CSV and Parquet input file formats and a utility is provided to convert from `tbl`
-(generated by the `dbgen` utility) to CSV and Parquet.
-
-```bash
-cargo run --release --bin tpch -- convert --input ./data --output /mnt/tpch-parquet --format parquet
+cargo run --release --features "mimalloc" --bin dfbench tpch --iterations 3 --path ./data --format tbl --query 1 --batch-size 4096
```

Or if you want to verify and run all the queries in the benchmark, you can just run `cargo test`.

-#### Sorted Conversion
-
-The TPCH tables generated by the dbgen utility are sorted by their first column (their primary key for most tables, the `l_orderkey` column for the `lineitem` table.)
-
-To preserve this sorted order information during conversion (useful for benchmarking execution on pre-sorted data) include the `--sort` flag:
-
-```bash
-cargo run --release --bin tpch -- convert --input ./data --output /mnt/tpch-sorted-parquet --format parquet --sort
-```
-
### Comparing results between runs

Any `dfbench` execution with `-o <dir>` argument will produce a
@@ -316,7 +328,6 @@ This will produce output like:
└──────────────┴──────────────┴──────────────┴───────────────┘
```

-
# Benchmark Runner

The `dfbench` program contains subcommands to run the various
@@ -356,24 +367,28 @@ FLAGS:
```

# Profiling Memory Stats for each benchmark query
+
The `mem_profile` program wraps benchmark execution to measure memory usage statistics, such as peak RSS. It runs each benchmark query in a separate subprocess, capturing the child process’s stdout to print structured output.

Subcommands supported by mem_profile are a subset of those in `dfbench`.
Currently supported benchmarks include: Clickbench, H2o, Imdb, SortTpch, Tpch, TPCDS

Before running benchmarks, `mem_profile` automatically compiles the benchmark binary (`dfbench`) using `cargo build`. Note that the build profile used for `dfbench` is not tied to the profile used for running `mem_profile` itself.
We can explicitly specify the desired build profile using the `--bench-profile` option (e.g. release-nonlto). By prebuilding the binary and running each query in a separate process, we can ensure accurate memory statistics.

Currently, `mem_profile` only supports `mimalloc` as the memory allocator, since it relies on `mimalloc`'s API to collect memory statistics.

-Because it runs the compiled binary directly from the target directory, make sure your working directory is the top-level datafusion/ directory, where the target/ is also located.
+Because it runs the compiled binary directly from the target directory, make sure your working directory is the top-level datafusion/ directory, where the target/ is also located.
+
+The benchmark subcommand (e.g., `tpch`) and all following arguments are passed directly to `dfbench`. Be sure to specify `--bench-profile` before the benchmark subcommand.

-The benchmark subcommand (e.g., `tpch`) and all following arguments are passed directly to `dfbench`. Be sure to specify `--bench-profile` before the benchmark subcommand.
+Example:

-Example:
```shell
datafusion$ cargo run --profile release-nonlto --bin mem_profile -- --bench-profile release-nonlto tpch --path benchmarks/data/tpch_sf1 --partitions 4 --format parquet
```
+
Example Output:
+
```
Query Time (ms) Peak RSS Peak Commit Major Page Faults
----------------------------------------------------------------
@@ -402,19 +417,21 @@ Query Time (ms) Peak RSS Peak Commit Major Page Faults
```

## Reported Metrics
+
When running benchmarks, `mem_profile` collects several memory-related statistics using the mimalloc API:

-- Peak RSS (Resident Set Size):
-The maximum amount of physical memory used by the process.
-This is a process-level metric collected via OS-specific mechanisms and is not mimalloc-specific.
+- Peak RSS (Resident Set Size):
+  The maximum amount of physical memory used by the process.
+  This is a process-level metric collected via OS-specific mechanisms and is not mimalloc-specific.

- Peak Commit:
-The peak amount of memory committed by the allocator (i.e., total virtual memory reserved).
-This is mimalloc-specific. It gives a more allocator-aware view of memory usage than RSS.
+  The peak amount of memory committed by the allocator (i.e., total virtual memory reserved).
+  This is mimalloc-specific. It gives a more allocator-aware view of memory usage than RSS.

- Major Page Faults:
-The number of major page faults triggered during execution.
-This metric is obtained from the operating system and is not mimalloc-specific.
+  The number of major page faults triggered during execution.
+  This metric is obtained from the operating system and is not mimalloc-specific.
+

# Writing a new benchmark

## Creating or downloading data outside of the benchmark

@@ -603,6 +620,34 @@ This benchmark is derived from the [TPC-H][1] version

[2]: https://github.com/databricks/tpch-dbgen.git,
[2.17.1]: https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-h_v2.17.1.pdf

+## TPCDS
+
+Run the tpcds benchmark.
+
+For data, please clone the `datafusion-benchmarks` repo, which contains predefined parquet data at SF1.
+
+```shell
+git clone https://github.com/apache/datafusion-benchmarks
+```
+
+Then run the benchmark with the following command:
+
+```shell
+DATA_DIR=../../datafusion-benchmarks/tpcds/data/sf1/ ./benchmarks/bench.sh run tpcds
+```
+
+Alternatively, benchmark a specific query:
+
+```shell
+DATA_DIR=../../datafusion-benchmarks/tpcds/data/sf1/ ./benchmarks/bench.sh run tpcds 30
+```
+
+For more help:
+
+```shell
+cargo run --release --bin dfbench -- tpcds --help
+```
+
## External Aggregation

Run the benchmark for aggregations with limited memory.

@@ -762,7 +807,7 @@ Different queries are included to test nested loop joins under various workloads

## Hash Join

-This benchmark focuses on the performance of queries with nested hash joins, minimizing other overheads such as scanning data sources or evaluating predicates.
+This benchmark focuses on the performance of queries with hash joins, minimizing other overheads such as scanning data sources or evaluating predicates.

Several queries are included to test hash joins under various workloads.

@@ -774,6 +819,19 @@ Several queries are included to test hash joins under various workloads.

./bench.sh run hj
```
+## Sort Merge Join
+
+This benchmark focuses on the performance of queries with sort merge joins, minimizing other overheads such as scanning data sources or evaluating predicates.
+
+Several queries are included to test sort merge joins under various workloads.
+
+### Example Run
+
+```bash
+# No need to generate data: this benchmark uses table function `range()` as the data source
+
+./bench.sh run smj
+```
## Cancellation

Test performance of cancelling queries.

@@ -804,3 +862,82 @@ Getting results...
cancelling thread
done dropping runtime in 83.531417ms
```
+
+## Sorted Data Benchmarks
+
+### Data Sorted ClickBench
+
+Benchmark for queries on pre-sorted data to test sort order optimization.
+This benchmark uses a subset of the ClickBench dataset (hits.parquet, ~14GB) that has been pre-sorted by the EventTime column. The queries are designed to test DataFusion's performance when the data is already sorted, as is common in time-series workloads.
+
+The benchmark includes queries that:
+- Scan pre-sorted data with ORDER BY clauses that match the sort order
+- Test reverse scans on sorted data
+- Verify the performance results
+
+#### Generating Sorted Data
+
+The sorted dataset is automatically generated from the ClickBench partitioned dataset. You can configure the memory used during the sorting process with the `DATAFUSION_MEMORY_GB` environment variable. The default memory limit is 12GB.
+```bash
+./bench.sh data clickbench_sorted
+```
+
+To create the sorted dataset, for example with 16GB of memory, run:
+
+```bash
+DATAFUSION_MEMORY_GB=16 ./bench.sh data clickbench_sorted
+```
+
+This command will:
+1. Download the ClickBench partitioned dataset if not present
+2. Sort hits.parquet by EventTime in ascending order
+3. Save the sorted file as hits_sorted.parquet
+
+#### Running the Benchmark
+
+```bash
+./bench.sh run clickbench_sorted
+```
+
+This runs queries against the pre-sorted dataset with the `--sorted-by EventTime` flag, which informs DataFusion that the data is pre-sorted, allowing it to optimize away redundant sort operations.
+
+## Sort Pushdown
+
+Benchmarks for sort pushdown optimizations on TPC-H lineitem data (SF=1).
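+
+To make the tested query shape concrete, here is an illustrative sketch (a hypothetical query against the TPC-H `lineitem` table, not necessarily one of the benchmark's own q1-q8):
+
+```sql
+-- When the files are known to be sorted by l_shipdate, a DESC + LIMIT
+-- query like this can be served by a reverse scan plus TopK rather
+-- than a full sort of the table.
+SELECT l_orderkey, l_shipdate
+FROM lineitem
+ORDER BY l_shipdate DESC
+LIMIT 10;
+```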
+ +### Variants + +| Benchmark | Description | +|-----------|-------------| +| `sort_pushdown` | Baseline — no `WITH ORDER`, tests standard sort behavior | +| `sort_pushdown_sorted` | With `WITH ORDER` — tests sort elimination on sorted files | +| `sort_pushdown_inexact` | Inexact path (`--sorted` DESC) — multi-file with scrambled RGs, tests reverse scan + RG reorder | +| `sort_pushdown_inexact_unsorted` | No `WITH ORDER` — same data, tests Unsupported path + RG reorder | +| `sort_pushdown_inexact_overlap` | Multi-file scrambled RGs — streaming data scenario | + +### Queries + +**sort_pushdown / sort_pushdown_sorted** (q1-q8): +- q1-q4: ASC queries (sort elimination with `--sorted`) +- q5-q8: DESC LIMIT queries (reverse scan + TopK optimization with `--sorted`) + +**sort_pushdown_inexact** (q1-q4): DESC LIMIT queries on scrambled data + +### Data Generation + +The inexact/overlap data requires pyarrow (`pip install pyarrow`) to generate +multi-file parquet with scrambled row group order. DataFusion's COPY cannot produce +narrow-range RGs in scrambled order because the parquet writer merges rows from +adjacent chunks at RG boundaries. + +### Running + +```bash +# Generate data and run all sort pushdown benchmarks +./bench.sh data sort_pushdown +./bench.sh data sort_pushdown_inexact +./bench.sh run sort_pushdown +./bench.sh run sort_pushdown_sorted +./bench.sh run sort_pushdown_inexact +./bench.sh run sort_pushdown_inexact_overlap +``` diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh index dbfd319dd9ad4..7aa0418e1d74d 100755 --- a/benchmarks/bench.sh +++ b/benchmarks/bench.sh @@ -41,8 +41,15 @@ BENCHMARK=all DATAFUSION_DIR=${DATAFUSION_DIR:-$SCRIPT_DIR/..} DATA_DIR=${DATA_DIR:-$SCRIPT_DIR/data} CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"} +SQL_CARGO_COMMAND=${SQL_CARGO_COMMAND:-"cargo bench --bench sql"} PREFER_HASH_JOIN=${PREFER_HASH_JOIN:-true} -VIRTUAL_ENV=${VIRTUAL_ENV:-$SCRIPT_DIR/venv} +SIMULATE_LATENCY=${SIMULATE_LATENCY:-false} + +# Build latency arg based on SIMULATE_LATENCY setting +LATENCY_ARG="" +if [ "$SIMULATE_LATENCY" = "true" ]; then + LATENCY_ARG="--simulate-latency" +fi usage() { echo " @@ -53,7 +60,6 @@ $0 data [benchmark] $0 run [benchmark] [query] $0 compare $0 compare_detail -$0 venv ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Examples: @@ -71,7 +77,6 @@ data: Generates or downloads data needed for benchmarking run: Runs the named benchmark compare: Compares fastest results from benchmark runs compare_detail: Compares minimum, average (±stddev), and maximum results from benchmark runs -venv: Creates new venv (unless already exists) and installs compare's requirements into it ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Benchmarks @@ -87,6 +92,9 @@ tpch10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), tpch_csv10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single csv file per table, hash join tpch_mem10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), query from memory +# TPC-DS Benchmarks +tpcds: TPCDS inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table, hash join + # Extended TPC-H Benchmarks sort_tpch: Benchmark of sorting speed for end-to-end sort queries on TPC-H dataset (SF=1) sort_tpch10: Benchmark of sorting speed for end-to-end sort queries on TPC-H dataset (SF=10) @@ -99,6 +107,16 @@ clickbench_partitioned: ClickBench queries against partitioned (100 files) parqu clickbench_pushdown: ClickBench queries against partitioned (100 files) parquet w/ filter_pushdown 
enabled clickbench_extended: ClickBench \"inspired\" queries against a single parquet (DataFusion specific) +# Sort Pushdown Benchmarks +sort_pushdown: Sort pushdown baseline (no WITH ORDER) on TPC-H data (SF=1) +sort_pushdown_sorted: Sort pushdown with WITH ORDER — tests sort elimination on non-overlapping files +sort_pushdown_inexact: Sort pushdown Inexact path (--sorted DESC) — multi-file with scrambled RGs, tests reverse scan + RG reorder +sort_pushdown_inexact_unsorted: Sort pushdown Inexact path (no WITH ORDER) — same data, tests Unsupported path + RG reorder +sort_pushdown_inexact_overlap: Sort pushdown Inexact path — multi-file scrambled RGs (streaming data scenario) + +# Sorted Data Benchmarks (ORDER BY Optimization) +clickbench_sorted: ClickBench queries on pre-sorted data using prefer_existing_sort (tests sort elimination optimization) + # H2O.ai Benchmarks (Group By, Join, Window) h2o_small: h2oai benchmark with small dataset (1e7 rows) for groupby, default file format is csv h2o_medium: h2oai benchmark with medium dataset (1e8 rows) for groupby, default file format is csv @@ -126,6 +144,7 @@ imdb: Join Order Benchmark (JOB) using the IMDB dataset conver cancellation: How long cancelling a query takes nlj: Benchmark for simple nested loop joins, testing various join scenarios hj: Benchmark for simple hash joins, testing various join scenarios +smj: Benchmark for simple sort merge joins, testing various join scenarios compile_profile: Compile and execute TPC-H across selected Cargo profiles, reporting timing and binary size @@ -137,7 +156,7 @@ CARGO_COMMAND command that runs the benchmark binary DATAFUSION_DIR directory to use (default $DATAFUSION_DIR) RESULTS_NAME folder where the benchmark files are stored PREFER_HASH_JOIN Prefer hash join algorithm (default true) -VENV_PATH Python venv to use for compare and venv commands (default ./venv, override by /bin/activate) +SIMULATE_LATENCY Simulate object store latency to mimic S3 (default false) DATAFUSION_* Set the given datafusion configuration " exit 1 @@ -189,8 +208,8 @@ main() { echo "***************************" case "$BENCHMARK" in all) - data_tpch "1" - data_tpch "10" + data_tpch "1" "parquet" + data_tpch "10" "parquet" data_h2o "SMALL" data_h2o "MEDIUM" data_h2o "BIG" @@ -203,18 +222,25 @@ main() { # nlj uses range() function, no data generation needed ;; tpch) - data_tpch "1" + data_tpch "1" "parquet" ;; tpch_mem) - # same data as for tpch - data_tpch "1" + data_tpch "1" "parquet" + ;; + tpch_csv) + data_tpch "1" "csv" ;; tpch10) - data_tpch "10" + data_tpch "10" "parquet" ;; tpch_mem10) - # same data as for tpch10 - data_tpch "10" + data_tpch "10" "parquet" + ;; + tpch_csv10) + data_tpch "10" "csv" + ;; + tpcds) + data_tpcds ;; clickbench_1) data_clickbench_1 @@ -289,30 +315,42 @@ main() { ;; external_aggr) # same data as for tpch - data_tpch "1" + data_tpch "1" "parquet" + ;; + sort_pushdown|sort_pushdown_sorted) + data_sort_pushdown + ;; + sort_pushdown_inexact|sort_pushdown_inexact_unsorted|sort_pushdown_inexact_overlap) + data_sort_pushdown_inexact ;; sort_tpch) # same data as for tpch - data_tpch "1" + data_tpch "1" "parquet" ;; sort_tpch10) # same data as for tpch10 - data_tpch "10" + data_tpch "10" "parquet" ;; topk_tpch) # same data as for tpch - data_tpch "1" + data_tpch "1" "parquet" ;; nlj) # nlj uses range() function, no data generation needed echo "NLJ benchmark does not require data generation" ;; hj) - # hj uses range() function, no data generation needed - echo "HJ benchmark does not require data 
generation" + data_tpch "10" "parquet" + ;; + smj) + # smj uses range() function, no data generation needed + echo "SMJ benchmark does not require data generation" ;; compile_profile) - data_tpch "1" + data_tpch "1" "parquet" + ;; + clickbench_sorted) + clickbench_sorted ;; *) echo "Error: unknown benchmark '$BENCHMARK' for data generation" @@ -355,6 +393,7 @@ main() { echo "RESULTS_DIR: ${RESULTS_DIR}" echo "CARGO_COMMAND: ${CARGO_COMMAND}" echo "PREFER_HASH_JOIN: ${PREFER_HASH_JOIN}" + echo "SIMULATE_LATENCY: ${SIMULATE_LATENCY}" echo "***************************" # navigate to the appropriate directory @@ -384,6 +423,8 @@ main() { run_external_aggr run_nlj run_hj + run_tpcds + run_smj ;; tpch) run_tpch "1" "parquet" @@ -403,6 +444,9 @@ main() { tpch_mem10) run_tpch_mem "10" ;; + tpcds) + run_tpcds + ;; cancellation) run_cancellation ;; @@ -445,7 +489,7 @@ main() { h2o_medium_window) run_h2o_window "MEDIUM" "CSV" "window" ;; - h2o_big_window) + h2o_big_window) run_h2o_window "BIG" "CSV" "window" ;; h2o_small_parquet) @@ -479,6 +523,21 @@ main() { external_aggr) run_external_aggr ;; + sort_pushdown) + run_sort_pushdown + ;; + sort_pushdown_sorted) + run_sort_pushdown_sorted + ;; + sort_pushdown_inexact) + run_sort_pushdown_inexact + ;; + sort_pushdown_inexact_unsorted) + run_sort_pushdown_inexact_unsorted + ;; + sort_pushdown_inexact_overlap) + run_sort_pushdown_inexact_overlap + ;; sort_tpch) run_sort_tpch "1" ;; @@ -494,9 +553,15 @@ main() { hj) run_hj ;; + smj) + run_smj + ;; compile_profile) run_compile_profile "${PROFILE_ARGS[@]}" ;; + clickbench_sorted) + run_clickbench_sorted + ;; *) echo "Error: unknown benchmark '$BENCHMARK' for run" usage @@ -511,9 +576,6 @@ main() { compare_detail) compare_benchmarks "$ARG2" "$ARG3" "--detailed" ;; - venv) - setup_venv - ;; "") usage ;; @@ -529,7 +591,7 @@ main() { # Creates TPCH data at a certain scale factor, if it doesn't already # exist # -# call like: data_tpch($scale_factor) +# call like: data_tpch($scale_factor, format) # # Creates data in $DATA_DIR/tpch_sf1 for scale factor 1 # Creates data in $DATA_DIR/tpch_sf10 for scale factor 10 @@ -540,20 +602,23 @@ data_tpch() { echo "Internal error: Scale factor not specified" exit 1 fi + FORMAT=$2 + if [ -z "$FORMAT" ] ; then + echo "Internal error: Format not specified" + exit 1 + fi TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}" - echo "Creating tpch dataset at Scale Factor ${SCALE_FACTOR} in ${TPCH_DIR}..." + echo "Creating tpch $FORMAT dataset at Scale Factor ${SCALE_FACTOR} in ${TPCH_DIR}..." # Ensure the target data directory exists mkdir -p "${TPCH_DIR}" - # Create 'tbl' (CSV format) data into $DATA_DIR if it does not already exist - FILE="${TPCH_DIR}/supplier.tbl" - if test -f "${FILE}"; then - echo " tbl files exist ($FILE exists)." - else - echo " creating tbl files with tpch_dbgen..." - docker run -v "${TPCH_DIR}":/data -it --rm ghcr.io/scalytics/tpch-docker:main -vf -s "${SCALE_FACTOR}" + # check if tpchgen-cli is installed + if ! 
command -v tpchgen-cli &> /dev/null + then + echo "tpchgen-cli could not be found, please install it via 'cargo install tpchgen-cli'" + exit 1 fi # Copy expected answers into the ./data/answers directory if it does not already exist @@ -566,27 +631,52 @@ data_tpch() { docker run -v "${TPCH_DIR}":/data -it --entrypoint /bin/bash --rm ghcr.io/scalytics/tpch-docker:main -c "cp -f /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/" fi - # Create 'parquet' files from tbl - FILE="${TPCH_DIR}/supplier" - if test -d "${FILE}"; then - echo " parquet files exist ($FILE exists)." - else - echo " creating parquet files using benchmark binary ..." - pushd "${SCRIPT_DIR}" > /dev/null - $CARGO_COMMAND --bin tpch -- convert --input "${TPCH_DIR}" --output "${TPCH_DIR}" --format parquet - popd > /dev/null + if [ "$FORMAT" = "parquet" ]; then + # Create 'parquet' files, one directory per file + FILE="${TPCH_DIR}/supplier" + if test -d "${FILE}"; then + echo " parquet files exist ($FILE exists)." + else + echo " creating parquet files using tpchgen-cli ..." + tpchgen-cli --scale-factor "${SCALE_FACTOR}" --format parquet --parquet-compression='ZSTD(1)' --parts=1 --output-dir "${TPCH_DIR}" + fi + return fi - # Create 'csv' files from tbl - FILE="${TPCH_DIR}/csv/supplier" - if test -d "${FILE}"; then - echo " csv files exist ($FILE exists)." - else - echo " creating csv files using benchmark binary ..." - pushd "${SCRIPT_DIR}" > /dev/null - $CARGO_COMMAND --bin tpch -- convert --input "${TPCH_DIR}" --output "${TPCH_DIR}/csv" --format csv - popd > /dev/null + # Create 'csv' files, one directory per file + if [ "$FORMAT" = "csv" ]; then + FILE="${TPCH_DIR}/csv/supplier" + if test -d "${FILE}"; then + echo " csv files exist ($FILE exists)." + else + echo " creating csv files using tpchgen-cli binary ..." + tpchgen-cli --scale-factor "${SCALE_FACTOR}" --format csv --parts=1 --output-dir "${TPCH_DIR}/csv" + fi + return + fi + + echo "Error: unknown format '$FORMAT' for tpch data generation, expected 'parquet' or 'csv'" + exit 1 +} + +# Downloads TPC-DS data +data_tpcds() { + TPCDS_DIR="${DATA_DIR}/tpcds_sf1" + + # Check if `web_site.parquet` exists in the TPCDS data directory to verify data presence + echo "Checking TPC-DS data directory: ${TPCDS_DIR}" + if [ ! -f "${TPCDS_DIR}/web_site.parquet" ]; then + mkdir -p "${TPCDS_DIR}" + # Download the DataFusion benchmarks repository zip if it is not already downloaded + if [ ! -f "${DATA_DIR}/datafusion-benchmarks.zip" ]; then + echo "Downloading DataFusion benchmarks repository zip to: ${DATA_DIR}/datafusion-benchmarks.zip" + wget --timeout=30 --tries=3 -O "${DATA_DIR}/datafusion-benchmarks.zip" https://github.com/apache/datafusion-benchmarks/archive/refs/heads/main.zip + fi + echo "Extracting TPC-DS parquet data to ${TPCDS_DIR}..." + unzip -o -j -d "${TPCDS_DIR}" "${DATA_DIR}/datafusion-benchmarks.zip" datafusion-benchmarks-main/tpcds/data/sf1/* + echo "TPC-DS data extracted." fi + echo "Done." } # Runs the tpch benchmark @@ -596,30 +686,54 @@ run_tpch() { echo "Internal error: Scale factor not specified" exit 1 fi - TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}" - - RESULTS_FILE="${RESULTS_DIR}/tpch_sf${SCALE_FACTOR}.json" - echo "RESULTS_FILE: ${RESULTS_FILE}" + FORMAT=$2 echo "Running tpch benchmark..." 
-    FORMAT=$2
-    debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format ${FORMAT} -o "${RESULTS_FILE}" ${QUERY_ARG}
+    debug_run env BENCH_NAME=tpch \
+        BENCH_SIZE="${SCALE_FACTOR}" \
+        PREFER_HASH_JOIN="${PREFER_HASH_JOIN}" \
+        TPCH_FILE_TYPE="${FORMAT}" \
+        SIMULATE_LATENCY="${SIMULATE_LATENCY}" \
+        ${QUERY:+BENCH_QUERY="${QUERY}"} \
+        bash -c "$SQL_CARGO_COMMAND"
 }
 
-# Runs the tpch in memory
+# Runs the tpch benchmark in memory (needs tpch parquet data)
 run_tpch_mem() {
     SCALE_FACTOR=$1
     if [ -z "$SCALE_FACTOR" ] ; then
         echo "Internal error: Scale factor not specified"
         exit 1
     fi
-    TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}"
+    echo "Running tpch_mem benchmark..."
+
+    debug_run env BENCH_NAME=tpch \
+        BENCH_SIZE="${SCALE_FACTOR}" \
+        TPCH_FILE_TYPE="mem" \
+        PREFER_HASH_JOIN="${PREFER_HASH_JOIN}" \
+        SIMULATE_LATENCY="${SIMULATE_LATENCY}" \
+        ${QUERY:+BENCH_QUERY="${QUERY}"} \
+        bash -c "$SQL_CARGO_COMMAND"
+}
+
+# Runs the tpcds benchmark
+run_tpcds() {
+    TPCDS_DIR="${DATA_DIR}/tpcds_sf1"
 
-    RESULTS_FILE="${RESULTS_DIR}/tpch_mem_sf${SCALE_FACTOR}.json"
+    # Check that the TPC-DS data directory and a representative file exist
+    if [ ! -f "${TPCDS_DIR}/web_site.parquet" ]; then
+        echo "" >&2
+        echo "Please prepare the TPC-DS data first by running:" >&2
+        echo "  ./bench.sh data tpcds" >&2
+        echo "" >&2
+        exit 1
+    fi
+
+    RESULTS_FILE="${RESULTS_DIR}/tpcds_sf1.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
-    echo "Running tpch_mem benchmark..."
-    # -m means in memory
-    debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}" ${QUERY_ARG}
+    echo "Running tpcds benchmark..."
+
+    debug_run $CARGO_COMMAND --bin dfbench -- tpcds --iterations 5 --path "${TPCDS_DIR}" --query_path "../datafusion/core/tests/tpc-ds" --prefer_hash_join "${PREFER_HASH_JOIN}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 # Runs the compile profile benchmark helper
@@ -629,7 +743,7 @@ run_compile_profile() {
     local data_path="${DATA_DIR}/tpch_sf1"
 
     echo "Running compile profile benchmark..."
-    local cmd=(python3 "${runner}" --data "${data_path}")
+    local cmd=(uv run python3 "${runner}" --data "${data_path}")
     if [ ${#profiles[@]} -gt 0 ]; then
         cmd+=(--profiles "${profiles[@]}")
     fi
@@ -641,7 +755,7 @@ run_cancellation() {
     RESULTS_FILE="${RESULTS_DIR}/cancellation.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running cancellation benchmark..."
-    debug_run $CARGO_COMMAND --bin dfbench -- cancellation --iterations 5 --path "${DATA_DIR}/cancellation" -o "${RESULTS_FILE}"
+    debug_run $CARGO_COMMAND --bin dfbench -- cancellation --iterations 5 --path "${DATA_DIR}/cancellation" -o "${RESULTS_FILE}" ${LATENCY_ARG}
 }
 
@@ -695,7 +809,7 @@ run_clickbench_1() {
     RESULTS_FILE="${RESULTS_DIR}/clickbench_1.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running clickbench (1 file) benchmark..."
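+    # QUERY_ARG and LATENCY_ARG are optional flag strings assembled earlier in
+    # this script (LATENCY_ARG presumably derived from SIMULATE_LATENCY); both
+    # expand to nothing when their corresponding options are unset.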
-    debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}" ${QUERY_ARG}
+    debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 # Runs the clickbench benchmark with the partitioned parquet dataset (100 files)
@@ -703,7 +817,7 @@ run_clickbench_partitioned() {
     RESULTS_FILE="${RESULTS_DIR}/clickbench_partitioned.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running clickbench (partitioned, 100 files) benchmark..."
-    debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}" ${QUERY_ARG}
+    debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
@@ -712,7 +826,7 @@ run_clickbench_pushdown() {
     RESULTS_FILE="${RESULTS_DIR}/clickbench_pushdown.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running clickbench (partitioned, 100 files) benchmark with pushdown_filters=true, reorder_filters=true..."
-    debug_run $CARGO_COMMAND --bin dfbench -- clickbench --pushdown --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}" ${QUERY_ARG}
+    debug_run $CARGO_COMMAND --bin dfbench -- clickbench --pushdown --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
@@ -721,7 +835,7 @@ run_clickbench_extended() {
     RESULTS_FILE="${RESULTS_DIR}/clickbench_extended.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running clickbench (1 file) extended benchmark..."
-    debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended" -o "${RESULTS_FILE}" ${QUERY_ARG}
+    debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 # Downloads the IMDB dataset csv.gz files from Peter Boncz's homepage (one of the JOB paper authors)
@@ -806,7 +920,7 @@ data_imdb() {
         if [ "${DOWNLOADED_SIZE}" != "${expected_size}" ]; then
             echo "Error: Download size mismatch"
             echo "Expected: ${expected_size}"
-            echo "Got: ${DOWNLADED_SIZE}"
+            echo "Got: ${DOWNLOADED_SIZE}"
             echo "Please re-initiate the download"
             return 1
         fi
@@ -836,7 +950,7 @@ run_imdb() {
     RESULTS_FILE="${RESULTS_DIR}/imdb.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running imdb benchmark..."
- debug_run $CARGO_COMMAND --bin imdb -- benchmark datafusion --iterations 5 --path "${IMDB_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}" ${QUERY_ARG} + debug_run $CARGO_COMMAND --bin imdb -- benchmark datafusion --iterations 5 --path "${IMDB_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG} } data_h2o() { @@ -844,75 +958,13 @@ data_h2o() { SIZE=${1:-"SMALL"} DATA_FORMAT=${2:-"CSV"} - # Function to compare Python versions - version_ge() { - [ "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" = "$2" ] - } - - export PYO3_USE_ABI3_FORWARD_COMPATIBILITY=1 - - # Find the highest available Python version (3.10 or higher) - REQUIRED_VERSION="3.10" - PYTHON_CMD=$(command -v python3 || true) - - if [ -n "$PYTHON_CMD" ]; then - PYTHON_VERSION=$($PYTHON_CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')") - if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then - echo "Found Python version $PYTHON_VERSION, which is suitable." - else - echo "Python version $PYTHON_VERSION found, but version $REQUIRED_VERSION or higher is required." - PYTHON_CMD="" - fi - fi - - # Search for suitable Python versions if the default is unsuitable - if [ -z "$PYTHON_CMD" ]; then - # Loop through all available Python3 commands on the system - for CMD in $(compgen -c | grep -E '^python3(\.[0-9]+)?$'); do - if command -v "$CMD" &> /dev/null; then - PYTHON_VERSION=$($CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')") - if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then - PYTHON_CMD="$CMD" - echo "Found suitable Python version: $PYTHON_VERSION ($CMD)" - break - fi - fi - done - fi - - # If no suitable Python version found, exit with an error - if [ -z "$PYTHON_CMD" ]; then - echo "Python 3.10 or higher is required. Please install it." - return 1 - fi - - echo "Using Python command: $PYTHON_CMD" - - # Install falsa and other dependencies - echo "Installing falsa..." - - # Set virtual environment directory - VIRTUAL_ENV="${PWD}/venv" - - # Create a virtual environment using the detected Python command - $PYTHON_CMD -m venv "$VIRTUAL_ENV" - - # Activate the virtual environment and install dependencies - source "$VIRTUAL_ENV/bin/activate" - - # Ensure 'falsa' is installed (avoid unnecessary reinstall) - pip install --quiet --upgrade falsa - # Create directory if it doesn't exist H2O_DIR="${DATA_DIR}/h2o" mkdir -p "${H2O_DIR}" # Generate h2o test data echo "Generating h2o test data in ${H2O_DIR} with size=${SIZE} and format=${DATA_FORMAT}" - falsa groupby --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}" - - # Deactivate virtual environment after completion - deactivate + uv run falsa groupby --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}" } data_h2o_join() { @@ -920,75 +972,13 @@ data_h2o_join() { SIZE=${1:-"SMALL"} DATA_FORMAT=${2:-"CSV"} - # Function to compare Python versions - version_ge() { - [ "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" = "$2" ] - } - - export PYO3_USE_ABI3_FORWARD_COMPATIBILITY=1 - - # Find the highest available Python version (3.10 or higher) - REQUIRED_VERSION="3.10" - PYTHON_CMD=$(command -v python3 || true) - - if [ -n "$PYTHON_CMD" ]; then - PYTHON_VERSION=$($PYTHON_CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')") - if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then - echo "Found Python version $PYTHON_VERSION, which is suitable." 
-        else
-            echo "Python version $PYTHON_VERSION found, but version $REQUIRED_VERSION or higher is required."
-            PYTHON_CMD=""
-        fi
-    fi
-
-    # Search for suitable Python versions if the default is unsuitable
-    if [ -z "$PYTHON_CMD" ]; then
-        # Loop through all available Python3 commands on the system
-        for CMD in $(compgen -c | grep -E '^python3(\.[0-9]+)?$'); do
-            if command -v "$CMD" &> /dev/null; then
-                PYTHON_VERSION=$($CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
-                if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
-                    PYTHON_CMD="$CMD"
-                    echo "Found suitable Python version: $PYTHON_VERSION ($CMD)"
-                    break
-                fi
-            fi
-        done
-    fi
-
-    # If no suitable Python version found, exit with an error
-    if [ -z "$PYTHON_CMD" ]; then
-        echo "Python 3.10 or higher is required. Please install it."
-        return 1
-    fi
-
-    echo "Using Python command: $PYTHON_CMD"
-
-    # Install falsa and other dependencies
-    echo "Installing falsa..."
-
-    # Set virtual environment directory
-    VIRTUAL_ENV="${PWD}/venv"
-
-    # Create a virtual environment using the detected Python command
-    $PYTHON_CMD -m venv "$VIRTUAL_ENV"
-
-    # Activate the virtual environment and install dependencies
-    source "$VIRTUAL_ENV/bin/activate"
-
-    # Ensure 'falsa' is installed (avoid unnecessary reinstall)
-    pip install --quiet --upgrade falsa
-
     # Create directory if it doesn't exist
     H2O_DIR="${DATA_DIR}/h2o"
     mkdir -p "${H2O_DIR}"
 
     # Generate h2o test data
     echo "Generating h2o test data in ${H2O_DIR} with size=${SIZE} and format=${DATA_FORMAT}"
-    falsa join --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}"
-
-    # Deactivate virtual environment after completion
-    deactivate
+    uv run falsa join --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}"
 }
 
 # Runner for h2o groupby benchmark
@@ -1032,7 +1022,7 @@ run_h2o() {
         --path "${H2O_DIR}/${FILE_NAME}" \
         --queries-path "${QUERY_FILE}" \
         -o "${RESULTS_FILE}" \
-        ${QUERY_ARG}
+        ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 # Utility function to run h2o join/window benchmark
@@ -1084,7 +1074,7 @@ h2o_runner() {
         --join-paths "${H2O_DIR}/${X_TABLE_FILE_NAME},${H2O_DIR}/${SMALL_TABLE_FILE_NAME},${H2O_DIR}/${MEDIUM_TABLE_FILE_NAME},${H2O_DIR}/${LARGE_TABLE_FILE_NAME}" \
         --queries-path "${QUERY_FILE}" \
         -o "${RESULTS_FILE}" \
-        ${QUERY_ARG}
+        ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 # Runners for h2o join benchmark
@@ -1113,6 +1103,241 @@ run_external_aggr() {
     debug_run $CARGO_COMMAND --bin external_aggr -- benchmark --partitions 4 --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG}
 }
 
+# Generates sort pushdown benchmark data: TPC-H lineitem with 3 parts,
+# renamed so alphabetical order does NOT match sort key order.
+# This forces the sort pushdown optimizer to reorder files by statistics.
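+#
+# call like: data_sort_pushdown (takes no arguments; always generates at
+# scale factor 1)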
+# +# tpchgen produces 3 sorted, non-overlapping parquet files: +# lineitem.1.parquet: l_orderkey 1 ~ 2M (lowest keys) +# lineitem.2.parquet: l_orderkey 2M ~ 4M +# lineitem.3.parquet: l_orderkey 4M ~ 6M (highest keys) +# +# We rename them so alphabetical order is reversed: +# a_part3.parquet (highest keys, sorts first alphabetically) +# b_part2.parquet +# c_part1.parquet (lowest keys, sorts last alphabetically) +data_sort_pushdown() { + SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown/lineitem" + if [ -d "${SORT_PUSHDOWN_DIR}" ] && [ "$(ls -A ${SORT_PUSHDOWN_DIR}/*.parquet 2>/dev/null)" ]; then + echo "Sort pushdown data already exists at ${SORT_PUSHDOWN_DIR}" + return + fi + + echo "Generating sort pushdown benchmark data (3 parts with reversed naming)..." + + TEMP_DIR="${DATA_DIR}/sort_pushdown_temp" + mkdir -p "${TEMP_DIR}" "${SORT_PUSHDOWN_DIR}" + + tpchgen-cli --scale-factor 1 --format parquet --parquet-compression='ZSTD(1)' --parts=3 --output-dir "${TEMP_DIR}" + + # Rename: reverse alphabetical order vs key order + mv "${TEMP_DIR}/lineitem/lineitem.3.parquet" "${SORT_PUSHDOWN_DIR}/a_part3.parquet" + mv "${TEMP_DIR}/lineitem/lineitem.2.parquet" "${SORT_PUSHDOWN_DIR}/b_part2.parquet" + mv "${TEMP_DIR}/lineitem/lineitem.1.parquet" "${SORT_PUSHDOWN_DIR}/c_part1.parquet" + + rm -rf "${TEMP_DIR}" + + echo "Sort pushdown data generated at ${SORT_PUSHDOWN_DIR}" + ls -la "${SORT_PUSHDOWN_DIR}" +} + +run_sort_pushdown() { + SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown" + RESULTS_FILE="${RESULTS_DIR}/sort_pushdown.json" + echo "Running sort pushdown benchmark (no WITH ORDER)..." + debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${SORT_PUSHDOWN_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG} +} + +# Runs the sort pushdown benchmark with WITH ORDER (enables sort elimination) +run_sort_pushdown_sorted() { + SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown" + RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_sorted.json" + echo "Running sort pushdown benchmark (with WITH ORDER)..." + debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${SORT_PUSHDOWN_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG} +} + +# Generates data for sort pushdown Inexact benchmark. +# +# Produces multiple parquet files where each file has MULTIPLE row groups +# with scrambled RG order. This tests both: +# - Row-group-level reorder within each file (reorder_by_statistics) +# - TopK threshold initialization from RG statistics +# +# Strategy: +# 1. Write a single sorted file with small (100K-row) RGs (~61 RGs total). +# 2. Use pyarrow to redistribute RGs into N_FILES files, scrambling the +# RG order within each file using a deterministic permutation. +# Each file gets ~61/N_FILES RGs with narrow, non-overlapping ranges +# but in scrambled order. +# +# Writing a single file with ORDER BY scramble does NOT work: the parquet +# writer merges rows from adjacent chunks at RG boundaries, widening +# ranges and defeating reorder_by_statistics. +# +# Requires pyarrow (pip install pyarrow). +data_sort_pushdown_inexact() { + INEXACT_DIR="${DATA_DIR}/sort_pushdown_inexact/lineitem" + if [ -d "${INEXACT_DIR}" ] && [ "$(ls -A ${INEXACT_DIR}/*.parquet 2>/dev/null)" ]; then + echo "Sort pushdown Inexact data already exists at ${INEXACT_DIR}" + return + fi + + # Check pyarrow dependency (needed to split/scramble RGs) + if ! 
python3 -c "import pyarrow" 2>/dev/null; then + echo "Error: pyarrow is required for sort pushdown Inexact data generation." + echo "Install with: pip install pyarrow" + return 1 + fi + + echo "Generating sort pushdown Inexact benchmark data (multi-file, scrambled RGs)..." + + # Re-use the sort_pushdown data as the source (generate if missing) + data_sort_pushdown + + mkdir -p "${INEXACT_DIR}" + SRC_DIR="${DATA_DIR}/sort_pushdown/lineitem" + + # Step 1: Write a single sorted file with small (100K-row) RGs + TMPFILE="${INEXACT_DIR}/_sorted_small_rgs.parquet" + (cd "${SCRIPT_DIR}/.." && cargo run --release -p datafusion-cli -- -c " + CREATE EXTERNAL TABLE src + STORED AS PARQUET + LOCATION '${SRC_DIR}'; + + COPY (SELECT * FROM src ORDER BY l_orderkey) + TO '${TMPFILE}' + STORED AS PARQUET + OPTIONS ('format.max_row_group_size' '100000'); + ") + + # Step 2: Redistribute RGs into 3 files with scrambled RG order. + # Each file gets ~20 RGs. RG assignment: rg_idx % 3 determines file, + # permutation (rg_idx * 41 + 7) % n scrambles the order within file. + python3 -c " +import pyarrow.parquet as pq + +pf = pq.ParquetFile('${TMPFILE}') +n = pf.metadata.num_row_groups +n_files = 3 + +# Assign each RG to a file, scramble order within each file +file_rgs = [[] for _ in range(n_files)] +for rg_idx in range(n): + slot = (rg_idx * 41 + 7) % n # scrambled index + file_id = slot % n_files + file_rgs[file_id].append(rg_idx) + +# Write each file with its assigned RGs (in scrambled order) +for file_id in range(n_files): + rgs = file_rgs[file_id] + if not rgs: + continue + tables = [pf.read_row_group(rg) for rg in rgs] + writer = pq.ParquetWriter( + '${INEXACT_DIR}/part_%03d.parquet' % file_id, + pf.schema_arrow) + for t in tables: + writer.write_table(t) + writer.close() + print(f'File part_{file_id:03d}.parquet: {len(rgs)} RGs') +" + + rm -f "${TMPFILE}" + echo "Sort pushdown Inexact data generated at ${INEXACT_DIR}" + ls -la "${INEXACT_DIR}" + + # Also generate overlap data: same strategy but with different file count + # and permutation. Simulates streaming data with network delays where + # chunks arrive out of sequence. + # + # Requires pyarrow (pip install pyarrow). + OVERLAP_DIR="${DATA_DIR}/sort_pushdown_inexact_overlap/lineitem" + if [ -d "${OVERLAP_DIR}" ] && [ "$(ls -A ${OVERLAP_DIR}/*.parquet 2>/dev/null)" ]; then + echo "Sort pushdown Inexact overlap data already exists at ${OVERLAP_DIR}" + return + fi + + echo "Generating sort pushdown Inexact overlap data (multi-file, scrambled RGs)..." + mkdir -p "${OVERLAP_DIR}" + + # Step 1: Write a single sorted file with small (100K-row) RGs + TMPFILE="${OVERLAP_DIR}/_sorted_small_rgs.parquet" + (cd "${SCRIPT_DIR}/.." && cargo run --release -p datafusion-cli -- -c " + CREATE EXTERNAL TABLE src + STORED AS PARQUET + LOCATION '${SRC_DIR}'; + + COPY (SELECT * FROM src ORDER BY l_orderkey) + TO '${TMPFILE}' + STORED AS PARQUET + OPTIONS ('format.max_row_group_size' '100000'); + ") + + # Step 2: Redistribute into 5 files with scrambled RG order. 
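+    # Note: the permutation here ((rg_idx * 37 + 13) % n) and the file count
+    # (5 vs 3) differ from the Inexact set above, so the two layouts are not
+    # identical.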
+ python3 -c " +import pyarrow.parquet as pq + +pf = pq.ParquetFile('${TMPFILE}') +n = pf.metadata.num_row_groups +n_files = 5 + +file_rgs = [[] for _ in range(n_files)] +for rg_idx in range(n): + slot = (rg_idx * 37 + 13) % n + file_id = slot % n_files + file_rgs[file_id].append(rg_idx) + +for file_id in range(n_files): + rgs = file_rgs[file_id] + if not rgs: + continue + tables = [pf.read_row_group(rg) for rg in rgs] + writer = pq.ParquetWriter( + '${OVERLAP_DIR}/part_%03d.parquet' % file_id, + pf.schema_arrow) + for t in tables: + writer.write_table(t) + writer.close() + print(f'File part_{file_id:03d}.parquet: {len(rgs)} RGs') +" + + rm -f "${TMPFILE}" +} + +# Runs the sort pushdown Inexact benchmark (tests RG reorder by statistics). +# Enables pushdown_filters so TopK's dynamic filter is pushed to the parquet +# reader for late materialization (only needed for Inexact path). +run_sort_pushdown_inexact() { + INEXACT_DIR="${DATA_DIR}/sort_pushdown_inexact" + RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_inexact.json" + echo "Running sort pushdown Inexact benchmark (multi-file scrambled RGs, --sorted DESC)..." + DATAFUSION_EXECUTION_PARQUET_PUSHDOWN_FILTERS=true \ + debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${INEXACT_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown_inexact" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG} +} + +# Runs the sort pushdown Inexact benchmark WITHOUT declared ordering. +# Tests the Unsupported path in try_pushdown_sort where RG reorder by +# statistics can still help TopK queries without any file ordering guarantee. +run_sort_pushdown_inexact_unsorted() { + INEXACT_DIR="${DATA_DIR}/sort_pushdown_inexact" + RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_inexact_unsorted.json" + echo "Running sort pushdown Inexact benchmark (no WITH ORDER, Unsupported path)..." + DATAFUSION_EXECUTION_PARQUET_PUSHDOWN_FILTERS=true \ + debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${INEXACT_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown_inexact_unsorted" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG} +} + +# Runs the sort pushdown benchmark with multi-file scrambled RG order. +# Simulates streaming data with network delays — multiple files, each with +# scrambled RGs. Tests both RG-level reorder and TopK stats initialization. +run_sort_pushdown_inexact_overlap() { + OVERLAP_DIR="${DATA_DIR}/sort_pushdown_inexact_overlap" + RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_inexact_overlap.json" + echo "Running sort pushdown Inexact benchmark (multi-file scrambled RGs, streaming data pattern)..." + DATAFUSION_EXECUTION_PARQUET_PUSHDOWN_FILTERS=true \ + debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${OVERLAP_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown_inexact_overlap" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG} +} + # Runs the sort integration benchmark run_sort_tpch() { SCALE_FACTOR=$1 @@ -1125,7 +1350,7 @@ run_sort_tpch() { echo "RESULTS_FILE: ${RESULTS_FILE}" echo "Running sort tpch benchmark..." - debug_run $CARGO_COMMAND --bin dfbench -- sort-tpch --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} + debug_run $CARGO_COMMAND --bin dfbench -- sort-tpch --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG} } # Runs the sort tpch integration benchmark with limit 100 (topk) @@ -1135,7 +1360,7 @@ run_topk_tpch() { echo "RESULTS_FILE: ${RESULTS_FILE}" echo "Running topk tpch benchmark..." 
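+    # Note: identical to run_sort_tpch except for --limit 100, which lets the
+    # planner use a TopK instead of a full sort.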
- $CARGO_COMMAND --bin dfbench -- sort-tpch --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" --limit 100 ${QUERY_ARG} + $CARGO_COMMAND --bin dfbench -- sort-tpch --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" --limit 100 ${QUERY_ARG} ${LATENCY_ARG} } # Runs the nlj benchmark @@ -1143,15 +1368,24 @@ run_nlj() { RESULTS_FILE="${RESULTS_DIR}/nlj.json" echo "RESULTS_FILE: ${RESULTS_FILE}" echo "Running nlj benchmark..." - debug_run $CARGO_COMMAND --bin dfbench -- nlj --iterations 5 -o "${RESULTS_FILE}" ${QUERY_ARG} + debug_run $CARGO_COMMAND --bin dfbench -- nlj --iterations 5 -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG} } # Runs the hj benchmark run_hj() { + TPCH_DIR="${DATA_DIR}/tpch_sf10" RESULTS_FILE="${RESULTS_DIR}/hj.json" echo "RESULTS_FILE: ${RESULTS_FILE}" echo "Running hj benchmark..." - debug_run $CARGO_COMMAND --bin dfbench -- hj --iterations 5 -o "${RESULTS_FILE}" ${QUERY_ARG} + debug_run $CARGO_COMMAND --bin dfbench -- hj --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG} +} + +# Runs the smj benchmark +run_smj() { + RESULTS_FILE="${RESULTS_DIR}/smj.json" + echo "RESULTS_FILE: ${RESULTS_FILE}" + echo "Running smj benchmark..." + debug_run $CARGO_COMMAND --bin dfbench -- smj --iterations 5 -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG} } @@ -1181,7 +1415,7 @@ compare_benchmarks() { echo "--------------------" echo "Benchmark ${BENCH}" echo "--------------------" - PATH=$VIRTUAL_ENV/bin:$PATH python3 "${SCRIPT_DIR}"/compare.py $OPTS "${RESULTS_FILE1}" "${RESULTS_FILE2}" + uv run python3 "${SCRIPT_DIR}"/compare.py $OPTS "${RESULTS_FILE1}" "${RESULTS_FILE2}" else echo "Note: Skipping ${RESULTS_FILE1} as ${RESULTS_FILE2} does not exist" fi @@ -1189,10 +1423,113 @@ compare_benchmarks() { } -setup_venv() { - python3 -m venv "$VIRTUAL_ENV" - PATH=$VIRTUAL_ENV/bin:$PATH python3 -m pip install -r requirements.txt +# Creates sorted ClickBench data from hits.parquet (full dataset) +# The data is sorted by EventTime in ascending order +# Uses datafusion-cli to reduce dependencies +clickbench_sorted() { + SORTED_FILE="${DATA_DIR}/hits_sorted.parquet" + ORIGINAL_FILE="${DATA_DIR}/hits.parquet" + + # Default memory limit is 12GB, can be overridden with DATAFUSION_MEMORY_GB env var + MEMORY_LIMIT_GB=${DATAFUSION_MEMORY_GB:-12} + + echo "Creating sorted ClickBench dataset from hits.parquet..." + echo "Configuration:" + echo " Memory limit: ${MEMORY_LIMIT_GB}G" + echo " Row group size: 64K rows" + echo " Compression: uncompressed" + + if [ ! -f "${ORIGINAL_FILE}" ]; then + echo "hits.parquet not found. Running data_clickbench_1 first..." + data_clickbench_1 + fi + + if [ -f "${SORTED_FILE}" ]; then + echo "Sorted hits.parquet already exists at ${SORTED_FILE}" + return 0 + fi + + echo "Sorting hits.parquet by EventTime (this may take several minutes)..." + + pushd "${DATAFUSION_DIR}" > /dev/null + echo "Building datafusion-cli..." + cargo build --release --bin datafusion-cli + DATAFUSION_CLI="${DATAFUSION_DIR}/target/release/datafusion-cli" + popd > /dev/null + + + START_TIME=$(date +%s) + echo "Start time: $(date '+%Y-%m-%d %H:%M:%S')" + echo "Using datafusion-cli to create sorted parquet file..." 
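+    # The SET statements in the heredoc below keep the sort within the memory
+    # limit; target_partitions = 1 ensures the COPY produces a single,
+    # globally sorted output file.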
+ "${DATAFUSION_CLI}" << EOF +-- Memory and performance configuration +SET datafusion.runtime.memory_limit = '${MEMORY_LIMIT_GB}G'; +SET datafusion.execution.spill_compression = 'uncompressed'; +SET datafusion.execution.sort_spill_reservation_bytes = 10485760; -- 10MB +SET datafusion.execution.batch_size = 8192; +SET datafusion.execution.target_partitions = 1; + +-- Parquet output configuration +SET datafusion.execution.parquet.max_row_group_size = 65536; +SET datafusion.execution.parquet.compression = 'uncompressed'; + +-- Execute sort and write +COPY (SELECT * FROM '${ORIGINAL_FILE}' ORDER BY "EventTime") +TO '${SORTED_FILE}' +STORED AS PARQUET; +EOF + + local result=$? + + END_TIME=$(date +%s) + DURATION=$((END_TIME - START_TIME)) + echo "End time: $(date '+%Y-%m-%d %H:%M:%S')" + + if [ $result -eq 0 ]; then + echo "✓ Successfully created sorted ClickBench dataset" + + INPUT_SIZE=$(stat -f%z "${ORIGINAL_FILE}" 2>/dev/null || stat -c%s "${ORIGINAL_FILE}" 2>/dev/null) + OUTPUT_SIZE=$(stat -f%z "${SORTED_FILE}" 2>/dev/null || stat -c%s "${SORTED_FILE}" 2>/dev/null) + INPUT_MB=$((INPUT_SIZE / 1024 / 1024)) + OUTPUT_MB=$((OUTPUT_SIZE / 1024 / 1024)) + + echo " Input: ${INPUT_MB} MB" + echo " Output: ${OUTPUT_MB} MB" + + echo "" + echo "Time Statistics:" + echo " Total duration: ${DURATION} seconds ($(printf '%02d:%02d:%02d' $((DURATION/3600)) $((DURATION%3600/60)) $((DURATION%60))))" + echo " Throughput: $((INPUT_MB / DURATION)) MB/s" + + return 0 + else + echo "✗ Error: Failed to create sorted dataset" + echo "💡 Tip: Try increasing memory with: DATAFUSION_MEMORY_GB=16 ./bench.sh data clickbench_sorted" + return 1 + fi +} + +# Runs the sorted data benchmark with prefer_existing_sort configuration +run_clickbench_sorted() { + RESULTS_FILE="${RESULTS_DIR}/clickbench_sorted.json" + echo "RESULTS_FILE: ${RESULTS_FILE}" + echo "Running sorted data benchmark with prefer_existing_sort optimization..." + + # Ensure sorted data exists + clickbench_sorted + + # Run benchmark with prefer_existing_sort configuration + # This allows DataFusion to optimize away redundant sorts while maintaining parallelism + debug_run $CARGO_COMMAND --bin dfbench -- clickbench \ + --iterations 5 \ + --path "${DATA_DIR}/hits_sorted.parquet" \ + --queries-path "${SCRIPT_DIR}/queries/clickbench/queries/sorted_data" \ + --sorted-by "EventTime" \ + -c datafusion.optimizer.prefer_existing_sort=true \ + -o "${RESULTS_FILE}" \ + ${QUERY_ARG} ${LATENCY_ARG} } + # And start the process up main diff --git a/benchmarks/benches/sql.rs b/benchmarks/benches/sql.rs new file mode 100644 index 0000000000000..eade3194d1402 --- /dev/null +++ b/benchmarks/benches/sql.rs @@ -0,0 +1,321 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Criterion benchmark harness for SQL benchmark files under `sql_benchmarks`.
+//!
+//! SQL benchmarks describe setup, queries, result validation, and cleanup in
+//! `.benchmark` files. Run them with `benchmarks/bench.sh` or directly with
+//! Cargo, for example: `BENCH_NAME=tpch cargo bench --bench sql`.
+
+use clap::Parser;
+use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
+use datafusion::error::Result;
+use datafusion::prelude::SessionContext;
+use datafusion_benchmarks::sql_benchmark::SqlBenchmark;
+use datafusion_benchmarks::util::{CommonOpt, print_memory_stats};
+use datafusion_common::instant::Instant;
+use log::{debug, info};
+use std::collections::BTreeMap;
+use std::fs;
+use std::sync::LazyLock;
+use tokio::runtime::Runtime;
+
+static SQL_BENCHMARK_DIRECTORY: LazyLock<String> = LazyLock::new(|| {
+    format!(
+        "{}{}{}",
+        env!("CARGO_MANIFEST_DIR"),
+        std::path::MAIN_SEPARATOR,
+        "sql_benchmarks"
+    )
+});
+
+#[cfg(all(feature = "snmalloc", feature = "mimalloc"))]
+compile_error!(
+    "feature \"snmalloc\" and feature \"mimalloc\" cannot be enabled at the same time"
+);
+
+#[cfg(feature = "snmalloc")]
+#[global_allocator]
+static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc;
+
+#[cfg(feature = "mimalloc")]
+#[global_allocator]
+static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
+
+#[derive(Debug, Parser)]
+#[command(ignore_errors = true)]
+struct EnvParser {
+    #[command(flatten)]
+    options: CommonOpt,
+
+    #[arg(
+        env = "BENCH_PERSIST_RESULTS",
+        long = "persist_results",
+        default_value = "false",
+        action = clap::ArgAction::SetTrue
+    )]
+    persist_results: bool,
+
+    #[arg(
+        env = "BENCH_VALIDATE",
+        long = "validate_results",
+        default_value = "false",
+        action = clap::ArgAction::SetTrue
+    )]
+    validate: bool,
+
+    #[arg(env = "BENCH_NAME")]
+    name: Option<String>,
+
+    #[arg(env = "BENCH_SUBGROUP")]
+    subgroup: Option<String>,
+
+    #[arg(env = "BENCH_QUERY")]
+    query: Option<String>,
+}
+
+pub fn sql(c: &mut Criterion) {
+    env_logger::init();
+
+    let start = Instant::now();
+    let args = EnvParser::parse();
+    let rt = make_tokio_runtime();
+
+    println!("Loading benchmarks...");
+
+    let benchmarks = rt.block_on(async {
+        let ctx = make_ctx(&args).expect("SessionContext creation failed");
+
+        load_benchmarks(&args, &ctx, &SQL_BENCHMARK_DIRECTORY)
+            .await
+            .unwrap_or_else(|err| panic!("failed to load benchmarks: {err:?}"))
+    });
+
+    println!(
+        "Loaded benchmarks in {} ms ...",
+        start.elapsed().as_millis()
+    );
+
+    for (group, benchmarks) in benchmarks {
+        let mut group = c.benchmark_group(group);
+        group.sample_size(10);
+        group.sampling_mode(SamplingMode::Flat);
+
+        for mut benchmark in benchmarks {
+            // create a context
+            let ctx = make_ctx(&args).expect("SessionContext creation failed");
+
+            // initialize the benchmark. This parses the benchmark file and does any pre-execution
+            // work such as loading data into tables
+            rt.block_on(async {
+                benchmark
+                    .initialize(&ctx)
+                    .await
+                    .expect("initialization failed");
+
+                // run assertions
+                benchmark.assert(&ctx).await.expect("assertion failed");
+            });
+
+            let mut name = benchmark.name().to_string();
+            if !benchmark.subgroup().is_empty() {
+                name.push('_');
+                name.push_str(benchmark.subgroup());
+            }
+
+            if args.persist_results {
+                handle_persist(&rt, &ctx, &name, &mut benchmark);
+            } else if args.validate {
+                handle_verify(&rt, &ctx, &name, &mut benchmark);
+            } else {
+                info!("Running benchmark {name} ...");
+
+                let name = name.clone();
+                group.bench_function(name.clone(), |b| {
+                    b.iter(|| handle_run(&rt, &ctx, &args, &mut benchmark, &name))
+                });
+
+                print_memory_stats();
+
+                info!("Benchmark {name} completed");
+            }
+
+            // run cleanup
+            rt.block_on(async {
+                benchmark.cleanup(&ctx).await.expect("Cleanup failed");
+            });
+        }
+
+        group.finish();
+    }
+}
+
+fn handle_run(
+    rt: &Runtime,
+    ctx: &SessionContext,
+    args: &EnvParser,
+    benchmark: &mut SqlBenchmark,
+    name: &str,
+) {
+    rt.block_on(async {
+        benchmark
+            .run(ctx, args.validate)
+            .await
+            .unwrap_or_else(|err| panic!("Failed to run benchmark {name}: {err:?}"))
+    });
+}
+
+fn handle_persist(
+    rt: &Runtime,
+    ctx: &SessionContext,
+    name: &str,
+    benchmark: &mut SqlBenchmark,
+) {
+    info!("Running benchmark {name} prior to persisting results ...");
+
+    rt.block_on(async {
+        info!("Persisting benchmark {name} ...");
+
+        benchmark
+            .persist(ctx)
+            .await
+            .expect("Failed to persist results");
+    });
+
+    info!("Persisted benchmark {name} successfully");
+}
+
+fn handle_verify(
+    rt: &Runtime,
+    ctx: &SessionContext,
+    name: &str,
+    benchmark: &mut SqlBenchmark,
+) {
+    info!("Verifying benchmark {name} results ...");
+
+    rt.block_on(async {
+        benchmark
+            .run(ctx, true)
+            .await
+            .unwrap_or_else(|err| panic!("Failed to run benchmark {name}: {err:?}"));
+        benchmark
+            .verify(ctx)
+            .await
+            .unwrap_or_else(|err| panic!("Verification failed: {err:?}"));
+    });
+
+    info!("Verified benchmark {name} results successfully");
+}
+
+criterion_group!(benches, sql);
+criterion_main!(benches);
+
+fn make_tokio_runtime() -> Runtime {
+    tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()
+        .unwrap()
+}
+
+fn make_ctx(args: &EnvParser) -> Result<SessionContext> {
+    let config = args.options.config()?;
+    let rt = args.options.build_runtime()?;
+
+    Ok(SessionContext::new_with_config_rt(config, rt))
+}
+
+/// Recursively walks the directory tree starting at `path` and
+/// calls the callback function for every file encountered.
+pub fn list_files<F>(path: &str, callback: &mut F)
+where
+    F: FnMut(&str),
+{
+    let mut entries: Vec<fs::DirEntry> =
+        fs::read_dir(path).unwrap().filter_map(Result::ok).collect();
+    entries.sort_by_key(|entry| entry.path());
+
+    for dir_entry in entries {
+        let path = dir_entry.path();
+        if path.is_dir() {
+            // Recurse into the sub-directory
+            list_files(&path.to_string_lossy(), callback);
+        } else {
+            // For files, invoke the callback with the full path as a string
+            let full_str = path.to_string_lossy();
+            callback(&full_str);
+        }
+    }
+}
+
+/// Loads all benchmark files in the `sql_benchmarks` directory.
+/// For each file ending with `.benchmark` it creates a new
+/// `SqlBenchmark` instance.
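+/// Benchmarks are grouped by `SqlBenchmark::group()`; within a group they are
+/// sorted by name (query names are zero-padded, e.g. `Q01`, `Q02`, ...).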
+async fn load_benchmarks(
+    args: &EnvParser,
+    ctx: &SessionContext,
+    path: &str,
+) -> Result<BTreeMap<String, Vec<SqlBenchmark>>> {
+    let mut benches = BTreeMap::new();
+    let mut paths = Vec::new();
+
+    list_files(path, &mut |path: &str| {
+        if path.ends_with(".benchmark") {
+            paths.push(path.to_string());
+        }
+    });
+
+    for path in paths {
+        debug!("Loading benchmark from {path}");
+
+        let benchmark = SqlBenchmark::new(ctx, &path, &*SQL_BENCHMARK_DIRECTORY).await?;
+        let entries = benches
+            .entry(benchmark.group().to_string())
+            .or_insert(vec![]);
+
+        entries.push(benchmark);
+    }
+
+    benches = filter_benchmarks(args, benches);
+    benches.iter_mut().for_each(|(_, benchmarks)| {
+        benchmarks.sort_by(|b1, b2| b1.name().cmp(b2.name()))
+    });
+
+    Ok(benches)
+}
+
+fn filter_benchmarks(
+    args: &EnvParser,
+    benchmarks: BTreeMap<String, Vec<SqlBenchmark>>,
+) -> BTreeMap<String, Vec<SqlBenchmark>> {
+    match &args.name {
+        Some(bench_name) => benchmarks
+            .into_iter()
+            .filter(|(key, _val)| key.eq_ignore_ascii_case(bench_name))
+            .map(|(key, mut val)| {
+                if let Some(subgroup) = &args.subgroup {
+                    val.retain(|bench| bench.subgroup().eq_ignore_ascii_case(subgroup));
+                }
+                if let Some(query_number) = &args.query {
+                    let padded = format!("Q{query_number:0>2}");
+                    val.retain(|bench| bench.name().eq_ignore_ascii_case(&padded));
+                }
+                (key, val)
+            })
+            .collect(),
+        None => benchmarks,
+    }
+}
diff --git a/benchmarks/compare.py b/benchmarks/compare.py
index 7e51a38a92c2b..9ad1de980abe8 100755
--- a/benchmarks/compare.py
+++ b/benchmarks/compare.py
@@ -154,17 +154,17 @@ def compare(
     baseline = BenchmarkRun.load_from_file(baseline_path)
     comparison = BenchmarkRun.load_from_file(comparison_path)
 
-    console = Console()
+    console = Console(width=200)
 
     # use basename as the column names
-    baseline_header = baseline_path.parent.stem
-    comparison_header = comparison_path.parent.stem
+    baseline_header = baseline_path.parent.name
+    comparison_header = comparison_path.parent.name
 
     table = Table(show_header=True, header_style="bold magenta")
-    table.add_column("Query", style="dim", width=12)
-    table.add_column(baseline_header, justify="right", style="dim")
-    table.add_column(comparison_header, justify="right", style="dim")
-    table.add_column("Change", justify="right", style="dim")
+    table.add_column("Query", style="dim", no_wrap=True)
+    table.add_column(baseline_header, justify="right", style="dim", no_wrap=True)
+    table.add_column(comparison_header, justify="right", style="dim", no_wrap=True)
+    table.add_column("Change", justify="right", style="dim", no_wrap=True)
 
     faster_count = 0
     slower_count = 0
@@ -175,12 +175,12 @@ def compare(
 
     for baseline_result, comparison_result in zip(baseline.queries, comparison.queries):
         assert baseline_result.query == comparison_result.query
-        
+
         base_failed = not baseline_result.success
-        comp_failed = not comparison_result.success 
+        comp_failed = not comparison_result.success
         # If a query fails, its execution time is excluded from the performance comparison
         if base_failed or comp_failed:
-            change_text = "incomparable" 
+            change_text = "incomparable"
             failure_count += 1
             table.add_row(
                 f"Q{baseline_result.query}",
diff --git a/benchmarks/compare_tpcds.sh b/benchmarks/compare_tpcds.sh
new file mode 100755
index 0000000000000..48331a7c7510e
--- /dev/null
+++ b/benchmarks/compare_tpcds.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Compare TPC-DS benchmarks between two branches
+
+set -e
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+usage() {
+    echo "Usage: $0 <branch1> <branch2>"
+    echo ""
+    echo "Example: $0 main dev2"
+    echo ""
+    echo "Note: requires TPC-DS data; generate it first with ./bench.sh data tpcds"
+    exit 1
+}
+
+BRANCH1=${1:-""}
+BRANCH2=${2:-""}
+
+if [ -z "$BRANCH1" ] || [ -z "$BRANCH2" ]; then
+    usage
+fi
+
+# Store current branch
+CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
+
+echo "Comparing TPC-DS benchmarks: ${BRANCH1} vs ${BRANCH2}"
+
+# Run benchmark on first branch
+git checkout "$BRANCH1"
+./benchmarks/bench.sh run tpcds
+
+# Run benchmark on second branch
+git checkout "$BRANCH2"
+./benchmarks/bench.sh run tpcds
+
+# Compare results
+./benchmarks/bench.sh compare "$BRANCH1" "$BRANCH2"
+
+# Return to original branch
+git checkout "$CURRENT_BRANCH"
\ No newline at end of file
diff --git a/benchmarks/compare_tpch.sh b/benchmarks/compare_tpch.sh
new file mode 100755
index 0000000000000..85e8da29ce41d
--- /dev/null
+++ b/benchmarks/compare_tpch.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Compare TPC-H benchmarks between two branches
+
+set -e
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+usage() {
+    echo "Usage: $0 <branch1> <branch2>"
+    echo ""
+    echo "Example: $0 main dev2"
+    exit 1
+}
+
+BRANCH1=${1:-""}
+BRANCH2=${2:-""}
+
+if [ -z "$BRANCH1" ] || [ -z "$BRANCH2" ]; then
+    usage
+fi
+
+# Store current branch
+CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
+
+echo "Comparing TPC-H benchmarks: ${BRANCH1} vs ${BRANCH2}"
+
+# Run benchmark on first branch
+git checkout "$BRANCH1"
+./benchmarks/bench.sh run tpch
+
+# Run benchmark on second branch
+git checkout "$BRANCH2"
+./benchmarks/bench.sh run tpch
+
+# Compare results
+./benchmarks/bench.sh compare "$BRANCH1" "$BRANCH2"
+
+# Return to original branch
+git checkout "$CURRENT_BRANCH"
\ No newline at end of file
diff --git a/benchmarks/compile_profile.py b/benchmarks/compile_profile.py
index ae51de94937bf..a85e15ddacc04 100644
--- a/benchmarks/compile_profile.py
+++ b/benchmarks/compile_profile.py
@@ -19,8 +19,10 @@
 
 """Compile profile benchmark runner for DataFusion.
 
-Builds the `tpch` benchmark binary with several Cargo profiles (e.g. `--release` or `--profile ci`), runs the full TPC-H suite against the Parquet data under `benchmarks/data/tpch_sf1`, and reports compile time, execution time, and resulting
-binary size.
+Builds the `dfbench` benchmark binary with several Cargo profiles
+(e.g. `--release` or `--profile ci`), runs the full TPC-H suite against
+the Parquet data under `benchmarks/data/tpch_sf1`, and reports compile
+time, execution time, and resulting binary size.
 
 See `benchmarks/README.md` for usage.
 """
@@ -40,12 +42,15 @@
 DEFAULT_ITERATIONS = 1
 DEFAULT_FORMAT = "parquet"
 DEFAULT_PARTITIONS: int | None = None
-TPCH_BINARY = "tpch.exe" if os.name == "nt" else "tpch"
+BENCHMARK_PACKAGE = "datafusion-benchmarks"
+BENCHMARK_BINARY = "dfbench.exe" if os.name == "nt" else "dfbench"
 
 PROFILE_TARGET_DIR = {
     "dev": "debug",
     "release": "release",
     "ci": "ci",
+    "ci-optimized": "ci-optimized",
     "release-nonlto": "release-nonlto",
+    "profiling": "profiling",
 }
 
@@ -62,7 +67,10 @@ def parse_args() -> argparse.Namespace:
         "--profiles",
         nargs="+",
         default=list(PROFILE_TARGET_DIR.keys()),
-        help="Cargo profiles to test (default: dev release ci release-nonlto)",
+        help=(
+            "Cargo profiles to test "
+            "(default: dev release ci ci-optimized release-nonlto profiling)"
+        ),
     )
     parser.add_argument(
         "--data",
@@ -84,9 +92,25 @@ def timed_run(command: Iterable[str]) -> float:
 
 def cargo_build(profile: str) -> float:
     if profile == "dev":
-        command = ["cargo", "build", "--bin", "tpch"]
+        command = [
+            "cargo",
+            "build",
+            "--package",
+            BENCHMARK_PACKAGE,
+            "--bin",
+            "dfbench",
+        ]
     else:
-        command = ["cargo", "build", "--profile", profile, "--bin", "tpch"]
+        command = [
+            "cargo",
+            "build",
+            "--profile",
+            profile,
+            "--package",
+            BENCHMARK_PACKAGE,
+            "--bin",
+            "dfbench",
+        ]
     return timed_run(command)
 
@@ -102,14 +126,13 @@ def run_benchmark(profile: str, data_path: 
Path) -> float: def binary_size(profile: str) -> int: binary_dir = PROFILE_TARGET_DIR[profile] - binary_path = REPO_ROOT / "target" / binary_dir / TPCH_BINARY + binary_path = REPO_ROOT / "target" / binary_dir / BENCHMARK_BINARY return binary_path.stat().st_size diff --git a/benchmarks/lineprotocol.py b/benchmarks/lineprotocol.py index 75e09b662e3e1..40f643499f489 100644 --- a/benchmarks/lineprotocol.py +++ b/benchmarks/lineprotocol.py @@ -164,12 +164,12 @@ def lineformat( ) -> None: baseline = BenchmarkRun.load_from_file(baseline) context = baseline.context - benchamrk_str = f"benchmark,name={context.name},version={context.benchmark_version},datafusion_version={context.datafusion_version},num_cpus={context.num_cpus}" + benchmark_str = f"benchmark,name={context.name},version={context.benchmark_version},datafusion_version={context.datafusion_version},num_cpus={context.num_cpus}" for query in baseline.queries: query_str = f"query=\"{query.query}\"" timestamp = f"{query.start_time*10**9}" for iter_num, result in enumerate(query.iterations): - print(f"{benchamrk_str} {query_str},iteration={iter_num},row_count={result.row_count},elapsed_ms={result.elapsed*1000:.0f} {timestamp}\n") + print(f"{benchmark_str} {query_str},iteration={iter_num},row_count={result.row_count},elapsed_ms={result.elapsed*1000:.0f} {timestamp}\n") def main() -> None: parser = ArgumentParser() diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml new file mode 100644 index 0000000000000..e6a60582148ce --- /dev/null +++ b/benchmarks/pyproject.toml @@ -0,0 +1,6 @@ +[project] +name = "datafusion-benchmarks" +version = "0.1.0" +requires-python = ">=3.11" +# typing_extensions is an undeclared dependency of falsa +dependencies = ["rich", "falsa", "typing_extensions"] diff --git a/benchmarks/queries/clickbench/README.md b/benchmarks/queries/clickbench/README.md index 877ea0e0c3192..8b3d08b128866 100644 --- a/benchmarks/queries/clickbench/README.md +++ b/benchmarks/queries/clickbench/README.md @@ -228,6 +228,41 @@ Results look like Elapsed 30.195 seconds. ``` + +### Q9-Q12: FIRST_VALUE Aggregation Performance + +These queries test the performance of the `FIRST_VALUE` aggregation function with different data types and grouping cardinalities. + +| Query | `FIRST_VALUE` Column | Column Type | Group By Column | Group By Type | Number of Groups | +|-------|----------------------|-------------|-----------------|---------------|------------------| +| Q9 | `URL` | `Utf8` | `UserID` | `Int64` | 17,630,976 | +| Q10 | `URL` | `Utf8` | `OS` | `Int16` | 91 | +| Q11 | `WatchID` | `Int64` | `UserID` | `Int64` | 17,630,976 | +| Q12 | `WatchID` | `Int64` | `OS` | `Int16` | 91 | + + +### Q13: Filter-only URL Range Match + +**Question**: "What is the sum of counter IDs for page views with URLs in the normal URL string range?" + +**Important Query Properties**: Filter-only string range match. The `URL` +column is used only by the pushed-down filter and is not projected or +aggregated. This makes the query useful for measuring optimizations that can +skip RowFilter evaluation when Parquet row group statistics prove that all rows +in a row group satisfy the string predicate. The output-side aggregation is +intentionally lightweight so the scan-time filter evaluation cost remains +visible. Run this query with Parquet filter pushdown enabled, for example +`dfbench clickbench --pushdown --query 13`. 
+ +```sql +SELECT SUM("CounterID") AS counter_id_sum +FROM hits +WHERE "URL" < 'zzzz'; +``` + + + + ## Data Notes Here are some interesting statistics about the data used in the queries diff --git a/benchmarks/queries/clickbench/extended/q10.sql b/benchmarks/queries/clickbench/extended/q10.sql new file mode 100644 index 0000000000000..d6019de17854f --- /dev/null +++ b/benchmarks/queries/clickbench/extended/q10.sql @@ -0,0 +1,8 @@ +-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591 +-- set datafusion.execution.parquet.binary_as_string = true + +SELECT MAX(len) FROM ( + SELECT LENGTH(FIRST_VALUE("URL" ORDER BY "EventTime")) as len + FROM hits + GROUP BY "OS" +); diff --git a/benchmarks/queries/clickbench/extended/q11.sql b/benchmarks/queries/clickbench/extended/q11.sql new file mode 100644 index 0000000000000..bca38f836bb95 --- /dev/null +++ b/benchmarks/queries/clickbench/extended/q11.sql @@ -0,0 +1,8 @@ +-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591 +-- set datafusion.execution.parquet.binary_as_string = true + +SELECT MAX(fv) FROM ( + SELECT FIRST_VALUE("WatchID" ORDER BY "EventTime") as fv + FROM hits + GROUP BY "UserID" +); diff --git a/benchmarks/queries/clickbench/extended/q12.sql b/benchmarks/queries/clickbench/extended/q12.sql new file mode 100644 index 0000000000000..fa062ac1f5cde --- /dev/null +++ b/benchmarks/queries/clickbench/extended/q12.sql @@ -0,0 +1,8 @@ +-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591 +-- set datafusion.execution.parquet.binary_as_string = true + +SELECT MAX(fv) FROM ( + SELECT FIRST_VALUE("WatchID" ORDER BY "EventTime") as fv + FROM hits + GROUP BY "OS" +); diff --git a/benchmarks/queries/clickbench/extended/q13.sql b/benchmarks/queries/clickbench/extended/q13.sql new file mode 100644 index 0000000000000..b76a0766566b3 --- /dev/null +++ b/benchmarks/queries/clickbench/extended/q13.sql @@ -0,0 +1,6 @@ +-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591 +-- set datafusion.execution.parquet.binary_as_string = true + +SELECT SUM("CounterID") AS counter_id_sum +FROM hits +WHERE "URL" < 'zzzz'; diff --git a/benchmarks/queries/clickbench/extended/q8.sql b/benchmarks/queries/clickbench/extended/q8.sql new file mode 100644 index 0000000000000..e580807841df5 --- /dev/null +++ b/benchmarks/queries/clickbench/extended/q8.sql @@ -0,0 +1,4 @@ +-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591 +-- set datafusion.execution.parquet.binary_as_string = true + +SELECT "RegionID", "UserAgent", "OS", AVG(to_timestamp("ResponseEndTiming")-to_timestamp("ResponseStartTiming")) as avg_response_time, AVG(to_timestamp("ResponseEndTiming")-to_timestamp("ConnectTiming")) as avg_latency FROM hits GROUP BY "RegionID", "UserAgent", "OS" ORDER BY avg_latency DESC limit 10; \ No newline at end of file diff --git a/benchmarks/queries/clickbench/extended/q9.sql b/benchmarks/queries/clickbench/extended/q9.sql new file mode 100644 index 0000000000000..53952ebec2627 --- /dev/null +++ b/benchmarks/queries/clickbench/extended/q9.sql @@ -0,0 +1,8 @@ +-- Must set for ClickBench hits_partitioned dataset. 
See https://github.com/apache/datafusion/issues/16591 +-- set datafusion.execution.parquet.binary_as_string = true + +SELECT MAX(len) FROM ( + SELECT LENGTH(FIRST_VALUE("URL" ORDER BY "EventTime")) as len + FROM hits + GROUP BY "UserID" +); diff --git a/benchmarks/queries/clickbench/queries/sorted_data/q0.sql b/benchmarks/queries/clickbench/queries/sorted_data/q0.sql new file mode 100644 index 0000000000000..1170a383bcb22 --- /dev/null +++ b/benchmarks/queries/clickbench/queries/sorted_data/q0.sql @@ -0,0 +1,3 @@ +-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591 +-- set datafusion.execution.parquet.binary_as_string = true +SELECT * FROM hits ORDER BY "EventTime" DESC limit 10; diff --git a/benchmarks/queries/h2o/window.sql b/benchmarks/queries/h2o/window.sql index 071540927a4cf..fa16a3de32ca5 100644 --- a/benchmarks/queries/h2o/window.sql +++ b/benchmarks/queries/h2o/window.sql @@ -109,4 +109,11 @@ SELECT id3, v2, sum(v2) OVER (PARTITION BY id2 ORDER BY v2 RANGE BETWEEN 3 PRECEDING AND CURRENT ROW) AS my_range_between_by_id2 -FROM large; \ No newline at end of file +FROM large; + +-- Window Top-N (ROW_NUMBER top-2 per partition) +SELECT id2, largest2_v2 FROM ( + SELECT id2, v2 AS largest2_v2, + ROW_NUMBER() OVER (PARTITION BY id2 ORDER BY v2 DESC) AS order_v2 + FROM large WHERE v2 IS NOT NULL +) sub_query WHERE order_v2 <= 2; diff --git a/benchmarks/queries/q10.sql b/benchmarks/queries/q10.sql index 8613fd4962837..8ac2fd90798c9 100644 --- a/benchmarks/queries/q10.sql +++ b/benchmarks/queries/q10.sql @@ -16,7 +16,7 @@ where c_custkey = o_custkey and l_orderkey = o_orderkey and o_orderdate >= date '1993-10-01' - and o_orderdate < date '1994-01-01' + and o_orderdate < date '1993-10-01' + interval '3' month and l_returnflag = 'R' and c_nationkey = n_nationkey group by diff --git a/benchmarks/queries/q11.sql b/benchmarks/queries/q11.sql index c23ed1c71bfb3..9a9710d09ec35 100644 --- a/benchmarks/queries/q11.sql +++ b/benchmarks/queries/q11.sql @@ -13,7 +13,7 @@ group by ps_partkey having sum(ps_supplycost * ps_availqty) > ( select - sum(ps_supplycost * ps_availqty) * 0.0001 + sum(ps_supplycost * ps_availqty) * 0.0001 /* __TPCH_Q11_FRACTION__ */ from partsupp, supplier, @@ -24,4 +24,4 @@ group by and n_name = 'GERMANY' ) order by - value desc; \ No newline at end of file + value desc; diff --git a/benchmarks/queries/q12.sql b/benchmarks/queries/q12.sql index f8e6d960c8420..c3f4d62344701 100644 --- a/benchmarks/queries/q12.sql +++ b/benchmarks/queries/q12.sql @@ -23,8 +23,8 @@ where and l_commitdate < l_receiptdate and l_shipdate < l_commitdate and l_receiptdate >= date '1994-01-01' - and l_receiptdate < date '1995-01-01' + and l_receiptdate < date '1994-01-01' + interval '1' year group by l_shipmode order by - l_shipmode; \ No newline at end of file + l_shipmode; diff --git a/benchmarks/queries/q14.sql b/benchmarks/queries/q14.sql index d8ef6afaca9bb..6fe88c42662d0 100644 --- a/benchmarks/queries/q14.sql +++ b/benchmarks/queries/q14.sql @@ -10,4 +10,4 @@ from where l_partkey = p_partkey and l_shipdate >= date '1995-09-01' - and l_shipdate < date '1995-10-01'; \ No newline at end of file + and l_shipdate < date '1995-09-01' + interval '1' month; diff --git a/benchmarks/queries/q5.sql b/benchmarks/queries/q5.sql index 5a336b231184b..146980ccd6f76 100644 --- a/benchmarks/queries/q5.sql +++ b/benchmarks/queries/q5.sql @@ -17,8 +17,8 @@ where and n_regionkey = r_regionkey and r_name = 'ASIA' and o_orderdate >= date '1994-01-01' - and 
o_orderdate < date '1995-01-01' + and o_orderdate < date '1994-01-01' + interval '1' year group by n_name order by - revenue desc; \ No newline at end of file + revenue desc; diff --git a/benchmarks/queries/q6.sql b/benchmarks/queries/q6.sql index 5806f980f8088..5a13fe7df765a 100644 --- a/benchmarks/queries/q6.sql +++ b/benchmarks/queries/q6.sql @@ -4,6 +4,6 @@ from lineitem where l_shipdate >= date '1994-01-01' - and l_shipdate < date '1995-01-01' + and l_shipdate < date '1994-01-01' + interval '1' year and l_discount between 0.06 - 0.01 and 0.06 + 0.01 - and l_quantity < 24; \ No newline at end of file + and l_quantity < 24; diff --git a/benchmarks/queries/sort_pushdown/q1.sql b/benchmarks/queries/sort_pushdown/q1.sql new file mode 100644 index 0000000000000..f5f51a5d4043e --- /dev/null +++ b/benchmarks/queries/sort_pushdown/q1.sql @@ -0,0 +1,6 @@ +-- Sort elimination: ORDER BY sort key ASC (full scan) +-- With --sorted: SortExec removed, sequential scan in file order +-- Without --sorted: full SortExec required +SELECT l_orderkey, l_partkey, l_suppkey +FROM lineitem +ORDER BY l_orderkey diff --git a/benchmarks/queries/sort_pushdown/q2.sql b/benchmarks/queries/sort_pushdown/q2.sql new file mode 100644 index 0000000000000..29a0e127cb7c6 --- /dev/null +++ b/benchmarks/queries/sort_pushdown/q2.sql @@ -0,0 +1,7 @@ +-- Sort elimination + limit pushdown +-- With --sorted: SortExec removed + limit pushed to DataSourceExec +-- Without --sorted: TopK sort over all data +SELECT l_orderkey, l_partkey, l_suppkey +FROM lineitem +ORDER BY l_orderkey +LIMIT 100 diff --git a/benchmarks/queries/sort_pushdown/q3.sql b/benchmarks/queries/sort_pushdown/q3.sql new file mode 100644 index 0000000000000..e11b48659a2a2 --- /dev/null +++ b/benchmarks/queries/sort_pushdown/q3.sql @@ -0,0 +1,5 @@ +-- Sort elimination: wide projection (all columns) +-- Tests sort elimination benefit with larger row payload +SELECT * +FROM lineitem +ORDER BY l_orderkey diff --git a/benchmarks/queries/sort_pushdown/q4.sql b/benchmarks/queries/sort_pushdown/q4.sql new file mode 100644 index 0000000000000..99500c371991a --- /dev/null +++ b/benchmarks/queries/sort_pushdown/q4.sql @@ -0,0 +1,5 @@ +-- Sort elimination + limit: wide projection +SELECT * +FROM lineitem +ORDER BY l_orderkey +LIMIT 100 diff --git a/benchmarks/queries/sort_pushdown/q5.sql b/benchmarks/queries/sort_pushdown/q5.sql new file mode 100644 index 0000000000000..60ad636ad3c9c --- /dev/null +++ b/benchmarks/queries/sort_pushdown/q5.sql @@ -0,0 +1,7 @@ +-- Reverse scan: ORDER BY DESC LIMIT (narrow projection) +-- With --sorted: reverse_row_groups=true + TopK + stats init + cumulative prune +-- Without --sorted: full TopK sort over all data +SELECT l_orderkey, l_partkey, l_suppkey +FROM lineitem +ORDER BY l_orderkey DESC +LIMIT 100 diff --git a/benchmarks/queries/sort_pushdown/q6.sql b/benchmarks/queries/sort_pushdown/q6.sql new file mode 100644 index 0000000000000..d36a35a1e5a0d --- /dev/null +++ b/benchmarks/queries/sort_pushdown/q6.sql @@ -0,0 +1,5 @@ +-- Reverse scan: ORDER BY DESC LIMIT larger fetch (narrow projection) +SELECT l_orderkey, l_partkey, l_suppkey +FROM lineitem +ORDER BY l_orderkey DESC +LIMIT 1000 diff --git a/benchmarks/queries/sort_pushdown/q7.sql b/benchmarks/queries/sort_pushdown/q7.sql new file mode 100644 index 0000000000000..3e8856822d83d --- /dev/null +++ b/benchmarks/queries/sort_pushdown/q7.sql @@ -0,0 +1,5 @@ +-- Reverse scan: wide projection + DESC LIMIT +SELECT * +FROM lineitem +ORDER BY l_orderkey DESC +LIMIT 100 diff --git 
a/benchmarks/queries/sort_pushdown/q8.sql b/benchmarks/queries/sort_pushdown/q8.sql new file mode 100644 index 0000000000000..95ba89fdd5089 --- /dev/null +++ b/benchmarks/queries/sort_pushdown/q8.sql @@ -0,0 +1,5 @@ +-- Reverse scan: wide projection + DESC LIMIT larger fetch +SELECT * +FROM lineitem +ORDER BY l_orderkey DESC +LIMIT 1000 diff --git a/benchmarks/queries/sort_pushdown_inexact/q1.sql b/benchmarks/queries/sort_pushdown_inexact/q1.sql new file mode 100644 index 0000000000000..d772bc486a12b --- /dev/null +++ b/benchmarks/queries/sort_pushdown_inexact/q1.sql @@ -0,0 +1,8 @@ +-- Inexact path: TopK + DESC LIMIT on ASC-declared file. +-- With RG reorder, the first RG read contains the highest max value, +-- so TopK's threshold tightens quickly and subsequent RGs get filtered +-- efficiently via dynamic filter pushdown. +SELECT l_orderkey, l_partkey, l_suppkey +FROM lineitem +ORDER BY l_orderkey DESC +LIMIT 100 diff --git a/benchmarks/queries/sort_pushdown_inexact/q2.sql b/benchmarks/queries/sort_pushdown_inexact/q2.sql new file mode 100644 index 0000000000000..6e2bef44fc37e --- /dev/null +++ b/benchmarks/queries/sort_pushdown_inexact/q2.sql @@ -0,0 +1,7 @@ +-- Inexact path: TopK + DESC LIMIT with larger fetch (1000). +-- Larger LIMIT means more row_replacements; RG reorder reduces the +-- total replacement count by tightening the threshold faster. +SELECT l_orderkey, l_partkey, l_suppkey +FROM lineitem +ORDER BY l_orderkey DESC +LIMIT 1000 diff --git a/benchmarks/queries/sort_pushdown_inexact/q3.sql b/benchmarks/queries/sort_pushdown_inexact/q3.sql new file mode 100644 index 0000000000000..d858ec79a67c9 --- /dev/null +++ b/benchmarks/queries/sort_pushdown_inexact/q3.sql @@ -0,0 +1,8 @@ +-- Inexact path: wide projection (all columns) + DESC LIMIT. +-- Shows the row-level filter benefit: with a tight threshold from the +-- first RG, subsequent RGs skip decoding non-sort columns for filtered +-- rows — bigger wins for wide tables. +SELECT * +FROM lineitem +ORDER BY l_orderkey DESC +LIMIT 100 diff --git a/benchmarks/queries/sort_pushdown_inexact/q4.sql b/benchmarks/queries/sort_pushdown_inexact/q4.sql new file mode 100644 index 0000000000000..bd2efc5d3b992 --- /dev/null +++ b/benchmarks/queries/sort_pushdown_inexact/q4.sql @@ -0,0 +1,7 @@ +-- Inexact path: wide projection + DESC LIMIT with larger fetch. +-- Combines wide-row row-level filter benefit with larger LIMIT to +-- demonstrate cumulative gains from RG reorder. +SELECT * +FROM lineitem +ORDER BY l_orderkey DESC +LIMIT 1000 diff --git a/benchmarks/queries/sort_pushdown_inexact_overlap/q1.sql b/benchmarks/queries/sort_pushdown_inexact_overlap/q1.sql new file mode 100644 index 0000000000000..0e978bddbed03 --- /dev/null +++ b/benchmarks/queries/sort_pushdown_inexact_overlap/q1.sql @@ -0,0 +1,7 @@ +-- Overlapping RGs: TopK + DESC LIMIT on file with partially overlapping +-- row groups (simulates streaming data with network jitter). +-- RG reorder places highest-max RG first for fastest threshold convergence. +SELECT l_orderkey, l_partkey, l_suppkey +FROM lineitem +ORDER BY l_orderkey DESC +LIMIT 100 diff --git a/benchmarks/queries/sort_pushdown_inexact_overlap/q2.sql b/benchmarks/queries/sort_pushdown_inexact_overlap/q2.sql new file mode 100644 index 0000000000000..34d0a910cbf3a --- /dev/null +++ b/benchmarks/queries/sort_pushdown_inexact_overlap/q2.sql @@ -0,0 +1,5 @@ +-- Overlapping RGs: DESC LIMIT with larger fetch. 
+SELECT l_orderkey, l_partkey, l_suppkey +FROM lineitem +ORDER BY l_orderkey DESC +LIMIT 1000 diff --git a/benchmarks/queries/sort_pushdown_inexact_overlap/q3.sql b/benchmarks/queries/sort_pushdown_inexact_overlap/q3.sql new file mode 100644 index 0000000000000..08b30b24d3dd1 --- /dev/null +++ b/benchmarks/queries/sort_pushdown_inexact_overlap/q3.sql @@ -0,0 +1,6 @@ +-- Overlapping RGs: wide projection + DESC LIMIT. +-- Row-level filter benefit: tight threshold skips decoding non-sort columns. +SELECT * +FROM lineitem +ORDER BY l_orderkey DESC +LIMIT 100 diff --git a/benchmarks/queries/sort_pushdown_inexact_overlap/q4.sql b/benchmarks/queries/sort_pushdown_inexact_overlap/q4.sql new file mode 100644 index 0000000000000..4c091424f901c --- /dev/null +++ b/benchmarks/queries/sort_pushdown_inexact_overlap/q4.sql @@ -0,0 +1,5 @@ +-- Overlapping RGs: wide projection + DESC LIMIT larger fetch. +SELECT * +FROM lineitem +ORDER BY l_orderkey DESC +LIMIT 1000 diff --git a/benchmarks/queries/sort_pushdown_inexact_unsorted/q1.sql b/benchmarks/queries/sort_pushdown_inexact_unsorted/q1.sql new file mode 100644 index 0000000000000..06748b72a98a3 --- /dev/null +++ b/benchmarks/queries/sort_pushdown_inexact_unsorted/q1.sql @@ -0,0 +1,7 @@ +-- Unsupported path: TopK + ASC LIMIT on file without declared ordering. +-- Tests RG reorder benefit when no WITH ORDER is declared — the +-- Unsupported path in try_pushdown_sort triggers RG reorder. +SELECT l_orderkey, l_partkey, l_suppkey +FROM lineitem +ORDER BY l_orderkey +LIMIT 100 diff --git a/benchmarks/queries/sort_pushdown_inexact_unsorted/q2.sql b/benchmarks/queries/sort_pushdown_inexact_unsorted/q2.sql new file mode 100644 index 0000000000000..384e4647eb0d9 --- /dev/null +++ b/benchmarks/queries/sort_pushdown_inexact_unsorted/q2.sql @@ -0,0 +1,5 @@ +-- Unsupported path: TopK + ASC LIMIT with larger fetch. +SELECT l_orderkey, l_partkey, l_suppkey +FROM lineitem +ORDER BY l_orderkey +LIMIT 1000 diff --git a/benchmarks/queries/sort_pushdown_inexact_unsorted/q3.sql b/benchmarks/queries/sort_pushdown_inexact_unsorted/q3.sql new file mode 100644 index 0000000000000..d48a2d969c468 --- /dev/null +++ b/benchmarks/queries/sort_pushdown_inexact_unsorted/q3.sql @@ -0,0 +1,6 @@ +-- Unsupported path: wide projection + ASC LIMIT. +-- Shows row-level filter benefit when RG reorder tightens TopK threshold. +SELECT * +FROM lineitem +ORDER BY l_orderkey +LIMIT 100 diff --git a/benchmarks/queries/sort_pushdown_inexact_unsorted/q4.sql b/benchmarks/queries/sort_pushdown_inexact_unsorted/q4.sql new file mode 100644 index 0000000000000..d12d48f43a626 --- /dev/null +++ b/benchmarks/queries/sort_pushdown_inexact_unsorted/q4.sql @@ -0,0 +1,5 @@ +-- Unsupported path: wide projection + ASC LIMIT with larger fetch. +SELECT * +FROM lineitem +ORDER BY l_orderkey +LIMIT 1000 diff --git a/benchmarks/queries/sort_pushdown_inexact_unsorted/q5.sql b/benchmarks/queries/sort_pushdown_inexact_unsorted/q5.sql new file mode 100644 index 0000000000000..ab1dddab408f3 --- /dev/null +++ b/benchmarks/queries/sort_pushdown_inexact_unsorted/q5.sql @@ -0,0 +1,5 @@ +-- Unsupported path: DESC LIMIT (no declared ordering = no reverse scan). 
+SELECT l_orderkey, l_partkey, l_suppkey +FROM lineitem +ORDER BY l_orderkey DESC +LIMIT 100 diff --git a/benchmarks/queries/sort_pushdown_inexact_unsorted/q6.sql b/benchmarks/queries/sort_pushdown_inexact_unsorted/q6.sql new file mode 100644 index 0000000000000..8366e96969195 --- /dev/null +++ b/benchmarks/queries/sort_pushdown_inexact_unsorted/q6.sql @@ -0,0 +1,5 @@ +-- Unsupported path: wide projection + DESC LIMIT. +SELECT * +FROM lineitem +ORDER BY l_orderkey DESC +LIMIT 100 diff --git a/benchmarks/sql_benchmarks/README.md b/benchmarks/sql_benchmarks/README.md new file mode 100644 index 0000000000000..e8899641c024b --- /dev/null +++ b/benchmarks/sql_benchmarks/README.md @@ -0,0 +1,355 @@
+
+# SQL Benchmarks
+
+This directory contains a collection of benchmarks, each driven by a simple '.benchmark' text file and SQL queries,
+that exercise the DataFusion execution engine against a variety of benchmark suites. The SQL benchmark framework
+is intentionally simple so that benchmarks and queries can be added or modified without touching the core
+engine or requiring recompilation.
+
+The SQL benchmarks are organized in sub-directories that correspond to benchmark suites commonly used
+in the community:
+
+| Benchmark Suite       | Description                                                        |
+|-----------------------|--------------------------------------------------------------------|
+| `clickbench`          | ClickBench benchmark                                               |
+| `clickbench extended` | 12 additional, more complex queries against the ClickBench dataset |
+| `clickbench_sorted`   | ClickBench benchmark using a pre-sorted hits file                  |
+| `h2o`                 | The `h2o` benchmark                                                |
+| `hj`                  | Hash join benchmark                                                |
+| `imdb`                | IMDb benchmark                                                     |
+| `nlj`                 | Nested-loop join benchmark                                         |
+| `smj`                 | Sort-merge join benchmark                                          |
+| `sort tpch`           | Sorting benchmarks against the TPC-H lineitem table                |
+| `taxi`                | NYC taxi dataset benchmark                                         |
+| `tpcds`               | TPC-DS queries                                                     |
+| `tpch`                | TPC-H queries                                                      |
+
+# Running Benchmarks
+
+The easiest way to run a benchmark is to use the `bench.sh` shell script (one level up from this document),
+as it takes care of configuring any required environment variables and can populate any required data files.
+However, it is also possible to run a SQL benchmark directly using the `cargo bench` command. For example:
+
+```shell
+BENCH_NAME=tpch cargo bench --bench sql
+```
+
+# Benchmark configuration
+
+SQL benchmarks are configured via environment variables. Cargo's bench command and
+[criterion](https://github.com/criterion-rs/criterion.rs) (the underlying benchmark framework) have an unfortunate
+limitation in that custom command-line arguments cannot be passed to a benchmark, so arguments are passed in
+via environment variables instead.
+
+The SQL benchmarking tool uses the following environment variables:
+
+| Environment Variable  | Description |
+|-----------------------|-------------|
+| BENCH_NAME            | The name of the benchmark suite to run, for example 'imdb'. This should correspond to a directory name in the `sql_benchmarks` directory. |
+| BENCH_SUBGROUP        | The subgroup within the benchmark suite to run, for example 'window' to run the window subgroup of the h2o benchmark. |
+| BENCH_QUERY           | A query number to run. |
+| BENCH_PERSIST_RESULTS | true/false to persist benchmark results. Results will be persisted in CSV format, so be cognizant of the size of the results. |
+| BENCH_VALIDATE        | true/false to validate benchmark results against persisted results or `result_query` directives. If both `BENCH_PERSIST_RESULTS` and `BENCH_VALIDATE` are true, persist mode runs and validation is skipped. |
+| SIMULATE_LATENCY      | Simulate object store latency to mimic remote storage (e.g. S3). Adds random latency in the range 20-200ms to each object store operation. |
+| MEM_POOL_TYPE         | The memory pool type to use; one of "fair" or "greedy". |
+| MEMORY_LIMIT          | Memory limit (e.g. '100M', '1.5G'). If not specified, all pre-defined memory limits for the given query are run (if any exist); otherwise the query runs with no memory limit. |
+
+Example – Run the H2O window benchmarks on the 'small' sized CSV data files:
+
+```bash
+BENCH_NAME=h2o BENCH_SUBGROUP=window H2O_BENCH_SIZE=small H2O_FILE_TYPE=csv cargo bench --bench sql
+```
+
+Some benchmarks use custom environment variables as outlined below:
+
+| Name                         | Description                                                                                                                | Default value |
+|------------------------------|----------------------------------------------------------------------------------------------------------------------------|---------------|
+| BENCH_SIZE                   | Used in the tpch, sort-tpch and tpcds benchmarks. The size corresponds to the scale factor.                                 | `1`           |
+| TPCH_FILE_TYPE               | Used in the tpch benchmark to specify which file type to query against. The valid options are `csv`, `parquet` and `mem`.   | `parquet`     |
+| H2O_FILE_TYPE                | Used in the h2o benchmark to specify which file type to query against. The valid options are `csv` and `parquet`.           | `csv`         |
+| CLICKBENCH_TYPE              | The type of partitioning for the clickbench benchmark. Valid options are `single` and `partitioned`.                        | `single`      |
+| H2O_BENCH_SIZE               | Used in the h2o benchmark. The valid options are `small`, `medium` and `big`.                                               | `small`       |
+| PREFER_HASH_JOIN             | Controls DataFusion's config option `datafusion.optimizer.prefer_hash_join`.                                                | true          |
+| HASH_JOIN_BUFFERING_CAPACITY | Controls DataFusion's config option `datafusion.execution.hash_join_buffering_capacity`.                                    | 0             |
+| BENCH_SORTED                 | Used in the sort_tpch benchmark to indicate whether the lineitem table should be sorted.                                    | false         |
+| SORTED_BY                    | Used in the clickbench_sorted benchmark to indicate the column to sort by.                                                  | `EventTime`   |
+| SORTED_ORDER                 | Used in the clickbench_sorted benchmark to indicate the sort order of the column.                                           | `ASC`         |
+
+## How it works
+
+SQL benchmarks are run via cargo's bench command, using [criterion](https://docs.rs/criterion/latest/criterion/)
+to run, and gather statistics for, each SQL statement being benchmarked.
+
+Each individual benchmark is represented by a `.benchmark` file that contains a number of directives instructing
+the tool on how to load data, run initializations, run assertions, run the benchmark, optionally persist and
+validate results, and finally run any cleanup if required.
+
+Variables are supported in two forms:
+
+* string substitution based on environment variables (with default values if unset): \${ENV_VAR} and
+  \${ENV_VAR:-default}.
+* if / else based on whether an environment variable is true or not
+  (\${ENV_VAR:-default|true value|false value}). In this form only the value `true` (case-insensitive) selects the
+  true branch; any other set value selects the false branch. If ENV_VAR is unset, the value of `default` is used
+  to select the branch.
+
+Comments in files are supported with lines starting with # or --.
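+
+To make the two substitution forms concrete, here is a small illustrative sketch; the `USE_PARQUET` variable and
+the paths are hypothetical, not taken from any existing benchmark:
+
+```
+# Plain substitution with a default: resolves to data/tpch_sf10/load.sql
+# when BENCH_SIZE=10, and to data/tpch_sf1/load.sql when BENCH_SIZE is unset.
+load data/tpch_sf${BENCH_SIZE:-1}/load.sql
+
+# if / else substitution: the default is 'true', so with USE_PARQUET unset
+# (or set to true) the 'parquet' branch is chosen; any other value, e.g.
+# USE_PARQUET=false, chooses the 'csv' branch.
+run sql_benchmarks/example/queries/q01_${USE_PARQUET:-true|parquet|csv}.sql
+```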
+
+Many, if not most, of the benchmarks are set up using templates to reduce duplication across the .benchmark files.
+For example, here is one of the benchmark files for the h2o benchmark suite:
+
+```
+subgroup groupby
+
+template sql_benchmarks/h2o/h2o.benchmark.template
+QUERY_NUMBER=1
+QUERY_NUMBER_PADDED=01
+```
+
+The benchmark file above declares the subgroup it is part of, then uses the template directive to set two variables
+(`QUERY_NUMBER` and `QUERY_NUMBER_PADDED`) and point to a file containing further directives that are shared across
+the benchmark suite:
+
+```
+load sql_benchmarks/h2o/init/load_${BENCH_SUBGROUP:-groupby}_${BENCH_SIZE:-small}_${BENCH_FILE_TYPE:-csv}.sql
+
+name Q${QUERY_NUMBER_PADDED}
+group h2o
+
+run sql_benchmarks/h2o/queries/${BENCH_SUBGROUP:-groupby}/q${QUERY_NUMBER_PADDED}.sql
+
+result sql_benchmarks/h2o/results/${BENCH_SUBGROUP:-groupby}/${BENCH_SIZE:-small}/q${QUERY_NUMBER_PADDED}.csv
+```
+
+The template file above showcases the use of default values for variables: `${NAME:-default}`.
+
+# Directives
+
+<table>
+<tr><th>Directive</th><th>Description</th></tr>
+<tr>
+<td>name</td>
+<td>
+The name of the benchmark. This will be used as part of the display name used by criterion.
+
+Example:
+
+<pre>
+name Q${QUERY_NUMBER_PADDED}
+</pre>
+
+The `name` directive also makes the value available to benchmark-file replacements as `BENCH_NAME`. This is separate
+from the `BENCH_NAME` environment variable used to select which benchmark group to run.
+</td>
+</tr>
+<tr>
+<td>group</td>
+<td>
+The group name of the benchmark, used for grouping benchmarks together.
+
+Example:
+
+<pre>
+group imdb
+</pre>
+</td>
+</tr>
+<tr>
+<td>subgroup</td>
+<td>
+The subgroup name of the benchmark, used for filtering to a specific subgroup.
+
+Example:
+
+<pre>
+subgroup window
+</pre>
+</td>
+</tr>
+<tr>
+<td>load</td>
+<td>
+The load directive is called during initialization of the benchmark. If a path to a file is provided on the same
+line as the load directive, that path is parsed and any SQL statements in that file are executed during
+initialization. If no path is specified, the next line must be the SQL statement to execute.
+
+The load directive (including any following SQL statement) must be followed by a blank line.
+
+Example:
+
+<pre>
+load sql_benchmarks/h2o/init/load_${BENCH_SUBGROUP:-groupby}_${BENCH_SIZE:-small}_${BENCH_FILE_TYPE:-csv}.sql
+</pre>
+
+or
+
+<pre>
+load
+CREATE TABLE test AS (SELECT value as key FROM range(1000000) ORDER BY value);
+</pre>
+</td>
+</tr>
+<tr>
+<td>init</td>
+<td>
+The init directive is called after the load directive, prior to benchmark execution. If a path to a file is
+provided on the same line as the init directive, that path is parsed and any SQL statements in that file are
+executed during benchmark initialization. If no path is specified, the next line must be the SQL statement to
+execute.
+
+The init directive (including any following SQL statement) must be followed by a blank line.
+
+Example:
+
+<pre>
+init
+set datafusion.execution.parquet.binary_as_string = true;
+</pre>
+</td>
+</tr>
+<tr>
+<td>run</td>
+<td>
+The run directive is called during execution of the benchmark. If a path to a file is provided on the same line as
+the run directive, that path is parsed and any SQL statements in that file are executed during the benchmark run.
+If no path is specified, the next line must be the SQL statement to execute.
+
+Multiple statements are allowed within a single run directive; however, a benchmark file may contain only one run
+directive. When running with `BENCH_PERSIST_RESULTS` or `BENCH_VALIDATE`, only the last `SELECT` or `WITH`
+statement from that run directive will be used for comparison.
+
+The run directive (including any following SQL statement) must be followed by a blank line.
+
+Example:
+
+<pre>
+run sql_benchmarks/imdb/queries/${QUERY_NUMBER_PADDED}.sql
+</pre>
+</td>
+</tr>
+<tr>
+<td>cleanup</td>
+<td>
+The cleanup directive is called after all other directives and can be used to clean up after the benchmark,
+e.g. to drop tables. If a path to a file is provided on the same line as the cleanup directive, that path is
+parsed and any SQL statements in that file are executed during cleanup. If no path is specified, the next line
+must be the SQL statement to execute.
+
+The cleanup directive (including any following SQL statement) must be followed by a blank line.
+
+Example:
+
+<pre>
+cleanup
+DROP TABLE test;
+</pre>
+</td>
+</tr>
+<tr>
+<td>expect_plan</td>
+<td>
+The expect_plan directive checks the physical plan for the string provided on the same line. This can be used to
+validate that a particular join implementation was used.
+
+Example:
+
+<pre>
+expect_plan NestedLoopJoinExec
+</pre>
+</td>
+</tr>
+<tr>
+<td>assert</td>
+<td>
+The assert directive is run between the init and run directives and can be used to validate that the system state
+is correct prior to running the benchmark SQL. The format is
+
+<pre>
+assert II
+SELECT name, value = 3 FROM information_schema.df_settings WHERE name IN ('datafusion.execution.target_partitions', 'datafusion.execution.planning_concurrency');
+----
+datafusion.execution.planning_concurrency true
+datafusion.execution.target_partitions true
+</pre>
+
+The number of I's corresponds to the number of columns in the result. The expected results can be either tab
+delimited or pipe delimited.
+</td>
+</tr>
+<tr>
+<td>result_query</td>
+<td>
+The result_query directive is run during the verify phase and can be used to verify a different set of results
+than those produced by queries executed from the `run` directive. The format is the same as the `assert` directive
+above.
+
+Example:
+
+<pre>
+result_query III
+SELECT COUNT(DISTINCT id2), SUM(r4), COUNT(*) FROM answer;
+----
+123 345 45
+</pre>
+
+Note that the results of the run query are not automatically stored in a table in DataFusion. If you want to
+verify a result from queries executed by the `run` directive, those queries will have to save their results to a
+table directly using `CREATE TABLE AS (..)` or similar.
+</td>
+</tr>
+<tr>
+<td>result</td>
+<td>
+The result directive declares the expected result file used during verification. A path to a file is required on
+the same line as the result directive. The file is parsed only during verification and must be a pipe-delimited
+CSV file with a header row. During verification, these expected rows are compared with the rows produced by the
+last saved `SELECT` or `WITH` statement from the `run` directive.
+
+Example:
+
+<pre>
+result sql_benchmarks/imdb/results/${QUERY_NUMBER_PADDED}.csv
+</pre>
+</td>
+</tr>
+<tr>
+<td>template</td>
+<td>
+The template directive allows for the inclusion of another file in a benchmark file. A path to a file, which will
+be parsed as a benchmark file, is required on the same line as the template directive. Parameters can be passed to
+the template file using the format `KEY=value`, one per line after the template directive, followed by a blank
+line.
+
+Example:
+
+<pre>
+template sql_benchmarks/smj/smj.benchmark.template
+QUERY_NUMBER=1
+QUERY_NUMBER_PADDED=01
+</pre>
+</td>
+</tr>
+<tr>
+<td>include</td>
+<td>
+The include directive is similar to the template directive except that it does not support parameters.
+</td>
+</tr>
+<tr>
+<td>echo</td>
+<td>
+The echo directive allows for echoing a string to stdout during the execution of the benchmark and may be useful
+for debugging.
+
+Example:
+
+<pre>
+echo The value for batch size is ${BATCH_SIZE:-8192}
+</pre>
+</td>
+</tr>
+</table>
+
+# Extending an existing benchmark suite
+
+If you want to add a new query:
+
+* Create a new qXX.sql in the corresponding queries folder of the benchmark.
+* Add a new qXX.benchmark that references the appropriate template (clickbench.benchmark.template,
+  h2o.benchmark.template, etc.).
+* (Optional) Add a new entry to the suite's load script if the data set is different.
+* (Optional) Manually create a result CSV to be compared against benchmark results during verification.
+
+# Adding a new benchmark suite
+
+* Create a new directory named for the new benchmark suite.
+* Within it, create a `.benchmark` file for each individual benchmark.
+* Populate the benchmark with directives as described above. Use the other benchmarks as examples for standardization.
+* No Rust files need to be updated to run the new benchmark suite.
diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q01.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q01.benchmark new file mode 100644 index 0000000000000..d490927df326f --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q01.benchmark @@ -0,0 +1,34 @@ +name Q01 +group tpch +subgroup sf${BENCH_SIZE:-1} + +init sql_benchmarks/tpch/init/set_config.sql + +load sql_benchmarks/tpch/init/load_${TPCH_FILE_TYPE:-parquet}.sql + +assert I +SELECT COUNT(*) > 0 from lineitem; +---- +true + +run +select l_returnflag, + l_linestatus, + sum(l_quantity) as sum_qty, + sum(l_extendedprice) as sum_base_price, + sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, + sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, + avg(l_quantity) as avg_qty, + avg(l_extendedprice) as avg_price, + avg(l_discount) as avg_disc, + count(*) as count_order + from lineitem + where l_shipdate <= date '1998-12-01' - interval '90' day + group by l_returnflag, + l_linestatus + order by l_returnflag, + l_linestatus; + +result sql_benchmarks/tpch/results/sf${BENCH_SIZE:-1}/q01.csv + +cleanup sql_benchmarks/tpch/init/cleanup.sql diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q02.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q02.benchmark new file mode 100644 index 0000000000000..6f365248b4998 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q02.benchmark @@ -0,0 +1,62 @@ +name Q02 +group tpch +subgroup sf${BENCH_SIZE:-1} + +init sql_benchmarks/tpch/init/set_config.sql + +load sql_benchmarks/tpch/init/load_${TPCH_FILE_TYPE:-parquet}.sql + +assert I +SELECT COUNT(*) > 0 from lineitem; +---- +true + +run +select + s_acctbal, + s_name, + n_name, + p_partkey, + p_mfgr, + s_address, + s_phone, + s_comment +from + part, + supplier, + partsupp, + nation, + region +where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and p_size = 15 + and p_type like '%BRASS' + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'EUROPE' + and ps_supplycost = ( + select + min(ps_supplycost) + from + partsupp, + supplier, + nation, + region + where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'EUROPE' +) +order by + s_acctbal desc, + n_name, + s_name, + p_partkey +limit 100; + +result sql_benchmarks/tpch/results/sf${BENCH_SIZE:-1}/q02.csv + +cleanup sql_benchmarks/tpch/init/cleanup.sql diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q03.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q03.benchmark new file mode 100644 index 0000000000000..cb16b10c2bb5a --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q03.benchmark @@
-0,0 +1,41 @@ +name Q03 +group tpch +subgroup sf${BENCH_SIZE:-1} + +init sql_benchmarks/tpch/init/set_config.sql + +load sql_benchmarks/tpch/init/load_${TPCH_FILE_TYPE:-parquet}.sql + +assert I +SELECT COUNT(*) > 0 from lineitem; +---- +true + +run +select + l_orderkey, + sum(l_extendedprice * (1 - l_discount)) as revenue, + o_orderdate, + o_shippriority +from + customer, + orders, + lineitem +where + c_mktsegment = 'BUILDING' + and c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate < date '1995-03-15' + and l_shipdate > date '1995-03-15' +group by + l_orderkey, + o_orderdate, + o_shippriority +order by + revenue desc, + o_orderdate +limit 10; + +result sql_benchmarks/tpch/results/sf${BENCH_SIZE:-1}/q03.csv + +cleanup sql_benchmarks/tpch/init/cleanup.sql diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q04.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q04.benchmark new file mode 100644 index 0000000000000..f2e6f9a558416 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q04.benchmark @@ -0,0 +1,39 @@ +name Q04 +group tpch +subgroup sf${BENCH_SIZE:-1} + +init sql_benchmarks/tpch/init/set_config.sql + +load sql_benchmarks/tpch/init/load_${TPCH_FILE_TYPE:-parquet}.sql + +assert I +SELECT COUNT(*) > 0 from lineitem; +---- +true + +run +select + o_orderpriority, + count(*) as order_count +from + orders +where + o_orderdate >= '1993-07-01' + and o_orderdate < date '1993-07-01' + interval '3' month + and exists ( + select + * + from + lineitem + where + l_orderkey = o_orderkey + and l_commitdate < l_receiptdate + ) +group by + o_orderpriority +order by + o_orderpriority; + +result sql_benchmarks/tpch/results/sf${BENCH_SIZE:-1}/q04.csv + +cleanup sql_benchmarks/tpch/init/cleanup.sql diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q05.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q05.benchmark new file mode 100644 index 0000000000000..9b5fbda63b4cb --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q05.benchmark @@ -0,0 +1,37 @@ +name Q05 +group tpch +subgroup sf${BENCH_SIZE:-1} + +init sql_benchmarks/tpch/init/set_config.sql + +load sql_benchmarks/tpch/init/load_${TPCH_FILE_TYPE:-parquet}.sql + +assert I +SELECT COUNT(*) > 0 from lineitem; +---- +true + +run +select n_name, + sum(l_extendedprice * (1 - l_discount)) as revenue +from customer, + orders, + lineitem, + supplier, + nation, + region +where c_custkey = o_custkey + and l_orderkey = o_orderkey + and l_suppkey = s_suppkey + and c_nationkey = s_nationkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'ASIA' + and o_orderdate >= date '1994-01-01' + and o_orderdate < date '1994-01-01' + interval '1' year +group by n_name +order by revenue desc; + +result sql_benchmarks/tpch/results/sf${BENCH_SIZE:-1}/q05.csv + +cleanup sql_benchmarks/tpch/init/cleanup.sql diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q06.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q06.benchmark new file mode 100644 index 0000000000000..761875cbf3558 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q06.benchmark @@ -0,0 +1,25 @@ +name Q06 +group tpch +subgroup sf${BENCH_SIZE:-1} + +init sql_benchmarks/tpch/init/set_config.sql + +load sql_benchmarks/tpch/init/load_${TPCH_FILE_TYPE:-parquet}.sql + +assert I +SELECT COUNT(*) > 0 from lineitem; +---- +true + +run +select sum(l_extendedprice * l_discount) as revenue +from lineitem +where l_shipdate >= date '1994-01-01' + and l_shipdate < date '1994-01-01' + interval '1' year + and l_discount between 
0.06 - 0.01 and 0.06 + 0.01 + and l_quantity < 24; + + +result sql_benchmarks/tpch/results/sf${BENCH_SIZE:-1}/q06.csv + +cleanup sql_benchmarks/tpch/init/cleanup.sql diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q07.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q07.benchmark new file mode 100644 index 0000000000000..30c4c520de823 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q07.benchmark @@ -0,0 +1,57 @@ +name Q07 +group tpch +subgroup sf${BENCH_SIZE:-1} + +init sql_benchmarks/tpch/init/set_config.sql + +load sql_benchmarks/tpch/init/load_${TPCH_FILE_TYPE:-parquet}.sql + +assert I +SELECT COUNT(*) > 0 from lineitem; +---- +true + +run +select + supp_nation, + cust_nation, + l_year, + sum(volume) as revenue +from + ( + select + n1.n_name as supp_nation, + n2.n_name as cust_nation, + extract(year from l_shipdate) as l_year, + l_extendedprice * (1 - l_discount) as volume + from + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2 + where + s_suppkey = l_suppkey + and o_orderkey = l_orderkey + and c_custkey = o_custkey + and s_nationkey = n1.n_nationkey + and c_nationkey = n2.n_nationkey + and ( + (n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY') + or (n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE') + ) + and l_shipdate between date '1995-01-01' and date '1996-12-31' + ) as shipping +group by + supp_nation, + cust_nation, + l_year +order by + supp_nation, + cust_nation, + l_year; + +result sql_benchmarks/tpch/results/sf${BENCH_SIZE:-1}/q07.csv + +cleanup sql_benchmarks/tpch/init/cleanup.sql diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q08.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q08.benchmark new file mode 100644 index 0000000000000..86caded4b7f1e --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q08.benchmark @@ -0,0 +1,55 @@ +name Q08 +group tpch +subgroup sf${BENCH_SIZE:-1} + +init sql_benchmarks/tpch/init/set_config.sql + +load sql_benchmarks/tpch/init/load_${TPCH_FILE_TYPE:-parquet}.sql + +assert I +SELECT COUNT(*) > 0 from lineitem; +---- +true + +run +select + o_year, + sum(case + when nation = 'BRAZIL' then volume + else 0 + end) / sum(volume) as mkt_share +from + ( + select + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) as volume, + n2.n_name as nation + from + part, + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2, + region + where + p_partkey = l_partkey + and s_suppkey = l_suppkey + and l_orderkey = o_orderkey + and o_custkey = c_custkey + and c_nationkey = n1.n_nationkey + and n1.n_regionkey = r_regionkey + and r_name = 'AMERICA' + and s_nationkey = n2.n_nationkey + and o_orderdate between date '1995-01-01' and date '1996-12-31' + and p_type = 'ECONOMY ANODIZED STEEL' + ) as all_nations +group by + o_year +order by + o_year; + +result sql_benchmarks/tpch/results/sf${BENCH_SIZE:-1}/q08.csv + +cleanup sql_benchmarks/tpch/init/cleanup.sql diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q09.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q09.benchmark new file mode 100644 index 0000000000000..3302cf6f0ba81 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q09.benchmark @@ -0,0 +1,50 @@ +name Q09 +group tpch +subgroup sf${BENCH_SIZE:-1} + +init sql_benchmarks/tpch/init/set_config.sql + +load sql_benchmarks/tpch/init/load_${TPCH_FILE_TYPE:-parquet}.sql + +assert I +SELECT COUNT(*) > 0 from lineitem; +---- +true + +run +select + nation, + o_year, + sum(amount) as sum_profit +from + ( + select + n_name as nation, + 
extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount + from + part, + supplier, + lineitem, + partsupp, + orders, + nation + where + s_suppkey = l_suppkey + and ps_suppkey = l_suppkey + and ps_partkey = l_partkey + and p_partkey = l_partkey + and o_orderkey = l_orderkey + and s_nationkey = n_nationkey + and p_name like '%green%' + ) as profit +group by + nation, + o_year +order by + nation, + o_year desc; + +result sql_benchmarks/tpch/results/sf${BENCH_SIZE:-1}/q09.csv + +cleanup sql_benchmarks/tpch/init/cleanup.sql diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q10.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q10.benchmark new file mode 100644 index 0000000000000..4ef08e3fd2074 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q10.benchmark @@ -0,0 +1,50 @@ +name Q10 +group tpch +subgroup sf${BENCH_SIZE:-1} + +init sql_benchmarks/tpch/init/set_config.sql + +load sql_benchmarks/tpch/init/load_${TPCH_FILE_TYPE:-parquet}.sql + +assert I +SELECT COUNT(*) > 0 from lineitem; +---- +true + +run +select + c_custkey, + c_name, + sum(l_extendedprice * (1 - l_discount)) as revenue, + c_acctbal, + n_name, + c_address, + c_phone, + c_comment +from + customer, + orders, + lineitem, + nation +where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate >= date '1993-10-01' + and o_orderdate < date '1993-10-01' + interval '3' month + and l_returnflag = 'R' + and c_nationkey = n_nationkey +group by + c_custkey, + c_name, + c_acctbal, + c_phone, + n_name, + c_address, + c_comment +order by + revenue desc +limit 20; + +result sql_benchmarks/tpch/results/sf${BENCH_SIZE:-1}/q10.csv + +cleanup sql_benchmarks/tpch/init/cleanup.sql diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q11.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q11.benchmark new file mode 100644 index 0000000000000..833799a39d756 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q11.benchmark @@ -0,0 +1,45 @@ +name Q11 +group tpch +subgroup sf${BENCH_SIZE:-1} + +init sql_benchmarks/tpch/init/set_config.sql + +load sql_benchmarks/tpch/init/load_${TPCH_FILE_TYPE:-parquet}.sql + +assert I +SELECT COUNT(*) > 0 from lineitem; +---- +true + +run +select + ps_partkey, + sum(ps_supplycost * ps_availqty) as value +from + partsupp, + supplier, + nation +where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'GERMANY' +group by + ps_partkey having + sum(ps_supplycost * ps_availqty) > ( + select + sum(ps_supplycost * ps_availqty) * (0.0001/${BENCH_SIZE:-1}) + from + partsupp, + supplier, + nation + where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'GERMANY' + ) +order by + value desc; + +result sql_benchmarks/tpch/results/sf${BENCH_SIZE:-1}/q11.csv + +cleanup sql_benchmarks/tpch/init/cleanup.sql diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q12.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q12.benchmark new file mode 100644 index 0000000000000..37aee848c962b --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q12.benchmark @@ -0,0 +1,43 @@ +name Q12 +group tpch +subgroup sf${BENCH_SIZE:-1} + +init sql_benchmarks/tpch/init/set_config.sql + +load sql_benchmarks/tpch/init/load_${TPCH_FILE_TYPE:-parquet}.sql + +assert I +SELECT COUNT(*) > 0 from lineitem; +---- +true + +run +select l_shipmode, + sum(case + when o_orderpriority = '1-URGENT' + or o_orderpriority = '2-HIGH' + then 1 + else 0 + end) as high_line_count, + sum(case + when o_orderpriority 
<> '1-URGENT' + and o_orderpriority <> '2-HIGH' + then 1 + else 0 + end) as low_line_count +from lineitem + join + orders + on + l_orderkey = o_orderkey +where l_shipmode in ('MAIL', 'SHIP') + and l_commitdate < l_receiptdate + and l_shipdate < l_commitdate + and l_receiptdate >= date '1994-01-01' + and l_receiptdate < date '1994-01-01' + interval '1' year +group by l_shipmode +order by l_shipmode; + +result sql_benchmarks/tpch/results/sf${BENCH_SIZE:-1}/q12.csv + +cleanup sql_benchmarks/tpch/init/cleanup.sql diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q13.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q13.benchmark new file mode 100644 index 0000000000000..dfb09853d0987 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q13.benchmark @@ -0,0 +1,38 @@ +name Q13 +group tpch +subgroup sf${BENCH_SIZE:-1} + +init sql_benchmarks/tpch/init/set_config.sql + +load sql_benchmarks/tpch/init/load_${TPCH_FILE_TYPE:-parquet}.sql + +assert I +SELECT COUNT(*) > 0 from lineitem; +---- +true + +run +select + c_count, + count(*) as custdist +from + ( + select + c_custkey, + count(o_orderkey) + from + customer left outer join orders on + c_custkey = o_custkey + and o_comment not like '%special%requests%' + group by + c_custkey + ) as c_orders (c_custkey, c_count) +group by + c_count +order by + custdist desc, + c_count desc; + +result sql_benchmarks/tpch/results/sf${BENCH_SIZE:-1}/q13.csv + +cleanup sql_benchmarks/tpch/init/cleanup.sql diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q14.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q14.benchmark new file mode 100644 index 0000000000000..b48d95043fdcb --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q14.benchmark @@ -0,0 +1,28 @@ +name Q14 +group tpch +subgroup sf${BENCH_SIZE:-1} + +init sql_benchmarks/tpch/init/set_config.sql + +load sql_benchmarks/tpch/init/load_${TPCH_FILE_TYPE:-parquet}.sql + +assert I +SELECT COUNT(*) > 0 from lineitem; +---- +true + +run +select 100.00 * sum(case + when p_type like 'PROMO%' + then l_extendedprice * (1 - l_discount) + else 0 + end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue +from lineitem, + part +where l_partkey = p_partkey + and l_shipdate >= date '1995-09-01' + and l_shipdate < date '1995-09-01' + interval '1' month; + +result sql_benchmarks/tpch/results/sf${BENCH_SIZE:-1}/q14.csv + +cleanup sql_benchmarks/tpch/init/cleanup.sql diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q15.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q15.benchmark new file mode 100644 index 0000000000000..0f50fc499d0b4 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q15.benchmark @@ -0,0 +1,49 @@ +name Q15 +group tpch +subgroup sf${BENCH_SIZE:-1} + +init sql_benchmarks/tpch/init/set_config.sql + +load sql_benchmarks/tpch/init/load_${TPCH_FILE_TYPE:-parquet}.sql + +assert I +SELECT COUNT(*) > 0 from lineitem; +---- +true + +run +create view revenue0 (supplier_no, total_revenue) as + select + l_suppkey, + sum(l_extendedprice * (1 - l_discount)) + from + lineitem + where + l_shipdate >= date '1996-01-01' + and l_shipdate < date '1996-01-01' + interval '3' month + group by + l_suppkey; +select + s_suppkey, + s_name, + s_address, + s_phone, + total_revenue +from + supplier, + revenue0 +where + s_suppkey = supplier_no + and total_revenue = ( + select + max(total_revenue) + from + revenue0 + ) +order by + s_suppkey; +drop view revenue0; + +result sql_benchmarks/tpch/results/sf${BENCH_SIZE:-1}/q15.csv + +cleanup sql_benchmarks/tpch/init/cleanup.sql 
diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q16.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q16.benchmark new file mode 100644 index 0000000000000..3fa6c68e29985 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q16.benchmark @@ -0,0 +1,48 @@ +name Q16 +group tpch +subgroup sf${BENCH_SIZE:-1} + +init sql_benchmarks/tpch/init/set_config.sql + +load sql_benchmarks/tpch/init/load_${TPCH_FILE_TYPE:-parquet}.sql + +assert I +SELECT COUNT(*) > 0 from lineitem; +---- +true + +run +select + p_brand, + p_type, + p_size, + count(distinct ps_suppkey) as supplier_cnt +from + partsupp, + part +where + p_partkey = ps_partkey + and p_brand <> 'Brand#45' + and p_type not like 'MEDIUM POLISHED%' + and p_size in (49, 14, 23, 45, 19, 3, 36, 9) + and ps_suppkey not in ( + select + s_suppkey + from + supplier + where + s_comment like '%Customer%Complaints%' +) +group by + p_brand, + p_type, + p_size +order by + supplier_cnt desc, + p_brand, + p_type, + p_size; + +result sql_benchmarks/tpch/results/sf${BENCH_SIZE:-1}/q16.csv + +cleanup sql_benchmarks/tpch/init/cleanup.sql diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q17.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q17.benchmark new file mode 100644 index 0000000000000..a31c837d1e164 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q17.benchmark @@ -0,0 +1,35 @@ +name Q17 +group tpch +subgroup sf${BENCH_SIZE:-1} + +init sql_benchmarks/tpch/init/set_config.sql + +load sql_benchmarks/tpch/init/load_${TPCH_FILE_TYPE:-parquet}.sql + +assert I +SELECT COUNT(*) > 0 from lineitem; +---- +true + +run +select + sum(l_extendedprice) / 7.0 as avg_yearly +from + lineitem, + part +where + p_partkey = l_partkey + and p_brand = 'Brand#23' + and p_container = 'MED BOX' + and l_quantity < ( + select + 0.2 * avg(l_quantity) + from + lineitem + where + l_partkey = p_partkey +); + +result sql_benchmarks/tpch/results/sf${BENCH_SIZE:-1}/q17.csv + +cleanup sql_benchmarks/tpch/init/cleanup.sql diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q18.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q18.benchmark new file mode 100644 index 0000000000000..149b0efd01c99 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q18.benchmark @@ -0,0 +1,51 @@ +name Q18 +group tpch +subgroup sf${BENCH_SIZE:-1} + +init sql_benchmarks/tpch/init/set_config.sql + +load sql_benchmarks/tpch/init/load_${TPCH_FILE_TYPE:-parquet}.sql + +assert I +SELECT COUNT(*) > 0 from lineitem; +---- +true + +run +select + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice, + sum(l_quantity) +from + customer, + orders, + lineitem +where + o_orderkey in ( + select + l_orderkey + from + lineitem + group by + l_orderkey having + sum(l_quantity) > 300 + ) + and c_custkey = o_custkey + and o_orderkey = l_orderkey +group by + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice +order by + o_totalprice desc, + o_orderdate +limit 100; + +result sql_benchmarks/tpch/results/sf${BENCH_SIZE:-1}/q18.csv + +cleanup sql_benchmarks/tpch/init/cleanup.sql diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q19.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q19.benchmark new file mode 100644 index 0000000000000..f93ad6cb73143 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q19.benchmark @@ -0,0 +1,53 @@ +name Q19 +group tpch +subgroup sf${BENCH_SIZE:-1} + +init sql_benchmarks/tpch/init/set_config.sql + +load sql_benchmarks/tpch/init/load_${TPCH_FILE_TYPE:-parquet}.sql + +assert I +SELECT COUNT(*) > 0 from 
lineitem; +---- +true + +run +select + sum(l_extendedprice* (1 - l_discount)) as revenue +from + lineitem, + part +where + ( + p_partkey = l_partkey + and p_brand = 'Brand#12' + and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') + and l_quantity >= 1 and l_quantity <= 1 + 10 + and p_size between 1 and 5 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#23' + and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') + and l_quantity >= 10 and l_quantity <= 10 + 10 + and p_size between 1 and 10 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#34' + and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') + and l_quantity >= 20 and l_quantity <= 20 + 10 + and p_size between 1 and 15 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ); + +result sql_benchmarks/tpch/results/sf${BENCH_SIZE:-1}/q19.csv + +cleanup sql_benchmarks/tpch/init/cleanup.sql diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q20.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q20.benchmark new file mode 100644 index 0000000000000..123386055b1ba --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q20.benchmark @@ -0,0 +1,55 @@ +name Q20 +group tpch +subgroup sf${BENCH_SIZE:-1} + +init sql_benchmarks/tpch/init/set_config.sql + +load sql_benchmarks/tpch/init/load_${TPCH_FILE_TYPE:-parquet}.sql + +assert I +SELECT COUNT(*) > 0 from lineitem; +---- +true + +run +select + s_name, + s_address +from + supplier, + nation +where + s_suppkey in ( + select + ps_suppkey + from + partsupp + where + ps_partkey in ( + select + p_partkey + from + part + where + p_name like 'forest%' + ) + and ps_availqty > ( + select + 0.5 * sum(l_quantity) + from + lineitem + where + l_partkey = ps_partkey + and l_suppkey = ps_suppkey + and l_shipdate >= date '1994-01-01' + and l_shipdate < date '1994-01-01' + interval '1' year + ) + ) + and s_nationkey = n_nationkey + and n_name = 'CANADA' +order by + s_name; + +result sql_benchmarks/tpch/results/sf${BENCH_SIZE:-1}/q20.csv + +cleanup sql_benchmarks/tpch/init/cleanup.sql diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q21.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q21.benchmark new file mode 100644 index 0000000000000..24d754a4cbd15 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q21.benchmark @@ -0,0 +1,58 @@ +name Q21 +group tpch +subgroup sf${BENCH_SIZE:-1} + +init sql_benchmarks/tpch/init/set_config.sql + +load sql_benchmarks/tpch/init/load_${TPCH_FILE_TYPE:-parquet}.sql + +assert I +SELECT COUNT(*) > 0 from lineitem; +---- +true + +run +select + s_name, + count(*) as numwait +from + supplier, + lineitem l1, + orders, + nation +where + s_suppkey = l1.l_suppkey + and o_orderkey = l1.l_orderkey + and o_orderstatus = 'F' + and l1.l_receiptdate > l1.l_commitdate + and exists ( + select + * + from + lineitem l2 + where + l2.l_orderkey = l1.l_orderkey + and l2.l_suppkey <> l1.l_suppkey + ) + and not exists ( + select + * + from + lineitem l3 + where + l3.l_orderkey = l1.l_orderkey + and l3.l_suppkey <> l1.l_suppkey + and l3.l_receiptdate > l3.l_commitdate + ) + and s_nationkey = n_nationkey + and n_name = 'SAUDI ARABIA' +group by + s_name +order by + numwait desc, + s_name +limit 100; + +result sql_benchmarks/tpch/results/sf${BENCH_SIZE:-1}/q21.csv + +cleanup sql_benchmarks/tpch/init/cleanup.sql diff --git 
a/benchmarks/sql_benchmarks/tpch/benchmarks/q22.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q22.benchmark new file mode 100644 index 0000000000000..7ef6a78496c32 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q22.benchmark @@ -0,0 +1,55 @@ +name Q22 +group tpch +subgroup sf${BENCH_SIZE:-1} + +init sql_benchmarks/tpch/init/set_config.sql + +load sql_benchmarks/tpch/init/load_${TPCH_FILE_TYPE:-parquet}.sql + +assert I +SELECT COUNT(*) > 0 from lineitem; +---- +true + +run +select + cntrycode, + count(*) as numcust, + sum(c_acctbal) as totacctbal +from + ( + select + substring(c_phone from 1 for 2) as cntrycode, + c_acctbal + from + customer + where + substring(c_phone from 1 for 2) in + ('13', '31', '23', '29', '30', '18', '17') + and c_acctbal > ( + select + avg(c_acctbal) + from + customer + where + c_acctbal > 0.00 + and substring(c_phone from 1 for 2) in + ('13', '31', '23', '29', '30', '18', '17') + ) + and not exists ( + select + * + from + orders + where + o_custkey = c_custkey + ) + ) as custsale +group by + cntrycode +order by + cntrycode; + +result sql_benchmarks/tpch/results/sf${BENCH_SIZE:-1}/q22.csv + +cleanup sql_benchmarks/tpch/init/cleanup.sql diff --git a/benchmarks/sql_benchmarks/tpch/init/cleanup.sql b/benchmarks/sql_benchmarks/tpch/init/cleanup.sql new file mode 100644 index 0000000000000..c8fb66a6a57e8 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/init/cleanup.sql @@ -0,0 +1,15 @@ +DROP TABLE IF EXISTS nation; + +DROP TABLE IF EXISTS region; + +DROP TABLE IF EXISTS supplier; + +DROP TABLE IF EXISTS customer; + +DROP TABLE IF EXISTS part; + +DROP TABLE IF EXISTS partsupp; + +DROP TABLE IF EXISTS orders; + +DROP TABLE IF EXISTS lineitem; diff --git a/benchmarks/sql_benchmarks/tpch/init/load_csv.sql b/benchmarks/sql_benchmarks/tpch/init/load_csv.sql new file mode 100644 index 0000000000000..f9a9b2e988e24 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/init/load_csv.sql @@ -0,0 +1,99 @@ +CREATE EXTERNAL TABLE nation +( + n_nationkey INT, + n_name CHAR(25), + n_regionkey INT, + n_comment VARCHAR(152), + PRIMARY KEY (n_nationkey) +) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/nation/nation.1.csv'; + +CREATE EXTERNAL TABLE region +( + r_regionkey INT, + r_name CHAR(25), + r_comment VARCHAR(152), + PRIMARY KEY (r_regionkey) +) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/region/region.1.csv'; + +CREATE EXTERNAL TABLE supplier +( + s_suppkey INT, + s_name CHAR(25), + s_address VARCHAR(40), + s_nationkey INT, + s_phone CHAR(15), + s_acctbal DECIMAL(15, 2), + s_comment VARCHAR(101), + PRIMARY KEY (s_suppkey) +) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/supplier/supplier.1.csv'; + +CREATE EXTERNAL TABLE customer +( + c_custkey INT, + c_name VARCHAR(25), + c_address VARCHAR(40), + c_nationkey INT, + c_phone CHAR(15), + c_acctbal DECIMAL(15, 2), + c_mktsegment CHAR(10), + c_comment VARCHAR(117), + PRIMARY KEY (c_custkey) +) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/customer/customer.1.csv'; + +CREATE EXTERNAL TABLE part +( + p_partkey INT, + p_name VARCHAR(55), + p_mfgr CHAR(25), + p_brand CHAR(10), + p_type VARCHAR(25), + p_size INT, + p_container CHAR(10), + p_retailprice DECIMAL(15, 2), + p_comment VARCHAR(23), + PRIMARY KEY (p_partkey) +) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/part/part.1.csv'; + +CREATE EXTERNAL TABLE partsupp +( + ps_partkey INT, + ps_suppkey INT, + ps_availqty INT, + ps_supplycost DECIMAL(15, 2), + ps_comment VARCHAR(199), + PRIMARY KEY (ps_partkey) +) 
STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/partsupp/partsupp.1.csv'; + +CREATE EXTERNAL TABLE orders +( + o_orderkey INT, + o_custkey INT, + o_orderstatus CHAR(1), + o_totalprice DECIMAL(15, 2), + o_orderdate DATE, + o_orderpriority CHAR(15), + o_clerk CHAR(15), + o_shippriority INT, + o_comment VARCHAR(79), + PRIMARY KEY (o_orderkey) +) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/orders/orders.1.csv'; + +CREATE EXTERNAL TABLE lineitem +( + l_orderkey INT, + l_partkey INT, + l_suppkey INT, + l_linenumber INT, + l_quantity DECIMAL(15, 2), + l_extendedprice DECIMAL(15, 2), + l_discount DECIMAL(15, 2), + l_tax DECIMAL(15, 2), + l_returnflag CHAR(1), + l_linestatus CHAR(1), + l_shipdate DATE, + l_commitdate DATE, + l_receiptdate DATE, + l_shipinstruct CHAR(25), + l_shipmode CHAR(10), + l_comment VARCHAR(44) +) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/lineitem/lineitem.1.csv'; \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/init/load_mem.sql b/benchmarks/sql_benchmarks/tpch/init/load_mem.sql new file mode 100644 index 0000000000000..57d12c22f0c52 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/init/load_mem.sql @@ -0,0 +1,31 @@ +CREATE EXTERNAL TABLE nation_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/nation/nation.1.parquet'; + +CREATE EXTERNAL TABLE region_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/region/region.1.parquet'; + +CREATE EXTERNAL TABLE supplier_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/supplier/supplier.1.parquet'; + +CREATE EXTERNAL TABLE customer_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/customer/customer.1.parquet'; + +CREATE EXTERNAL TABLE part_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/part/part.1.parquet'; + +CREATE EXTERNAL TABLE partsupp_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/partsupp/partsupp.1.parquet'; + +CREATE EXTERNAL TABLE orders_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/orders/orders.1.parquet'; + +CREATE EXTERNAL TABLE lineitem_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/lineitem/lineitem.1.parquet'; + +CREATE TABLE nation as SELECT * FROM nation_raw; + +CREATE TABLE region as SELECT * FROM region_raw; + +CREATE TABLE supplier as SELECT * FROM supplier_raw; + +CREATE TABLE customer as SELECT * FROM customer_raw; + +CREATE TABLE part as SELECT * FROM part_raw; + +CREATE TABLE partsupp as SELECT * FROM partsupp_raw; + +CREATE TABLE orders as SELECT * FROM orders_raw; + +CREATE TABLE lineitem as SELECT * FROM lineitem_raw; diff --git a/benchmarks/sql_benchmarks/tpch/init/load_parquet.sql b/benchmarks/sql_benchmarks/tpch/init/load_parquet.sql new file mode 100644 index 0000000000000..172a03d82a2cf --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/init/load_parquet.sql @@ -0,0 +1,15 @@ +CREATE EXTERNAL TABLE nation STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/nation/nation.1.parquet'; + +CREATE EXTERNAL TABLE region STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/region/region.1.parquet'; + +CREATE EXTERNAL TABLE supplier STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/supplier/supplier.1.parquet'; + +CREATE EXTERNAL TABLE customer STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/customer/customer.1.parquet'; + +CREATE EXTERNAL TABLE part STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/part/part.1.parquet'; + +CREATE EXTERNAL TABLE partsupp STORED AS PARQUET LOCATION 
'data/tpch_sf${BENCH_SIZE:-1}/partsupp/partsupp.1.parquet'; + +CREATE EXTERNAL TABLE orders STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/orders/orders.1.parquet'; + +CREATE EXTERNAL TABLE lineitem STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/lineitem/lineitem.1.parquet'; \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/init/set_config.sql b/benchmarks/sql_benchmarks/tpch/init/set_config.sql new file mode 100644 index 0000000000000..00457e2bca1ef --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/init/set_config.sql @@ -0,0 +1,3 @@ +set datafusion.optimizer.prefer_hash_join=${PREFER_HASH_JOIN:-true}; + +set datafusion.execution.hash_join_buffering_capacity=${HASH_JOIN_BUFFERING_CAPACITY:-0}; diff --git a/benchmarks/src/bin/dfbench.rs b/benchmarks/src/bin/dfbench.rs index 816cae0e38555..3b1f54291e75c 100644 --- a/benchmarks/src/bin/dfbench.rs +++ b/benchmarks/src/bin/dfbench.rs @@ -18,7 +18,7 @@ //! DataFusion benchmark runner use datafusion::error::Result; -use structopt::StructOpt; +use clap::{Parser, Subcommand}; #[cfg(all(feature = "snmalloc", feature = "mimalloc"))] compile_error!( @@ -34,11 +34,18 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; use datafusion_benchmarks::{ - cancellation, clickbench, h2o, hj, imdb, nlj, sort_tpch, tpch, + cancellation, clickbench, h2o, hj, imdb, nlj, smj, sort_pushdown, sort_tpch, tpcds, + tpch, }; -#[derive(Debug, StructOpt)] -#[structopt(about = "benchmark command")] +#[derive(Debug, Parser)] +#[command(about = "benchmark command")] +struct Cli { + #[command(subcommand)] + command: Options, +} + +#[derive(Debug, Subcommand)] enum Options { Cancellation(cancellation::RunOpt), Clickbench(clickbench::RunOpt), @@ -46,9 +53,11 @@ enum Options { HJ(hj::RunOpt), Imdb(imdb::RunOpt), Nlj(nlj::RunOpt), + Smj(smj::RunOpt), + SortPushdown(sort_pushdown::RunOpt), SortTpch(sort_tpch::RunOpt), Tpch(tpch::RunOpt), - TpchConvert(tpch::ConvertOpt), + Tpcds(tpcds::RunOpt), } // Main benchmark runner entrypoint @@ -56,15 +65,18 @@ enum Options { pub async fn main() -> Result<()> { env_logger::init(); - match Options::from_args() { + let cli = Cli::parse(); + match cli.command { Options::Cancellation(opt) => opt.run().await, Options::Clickbench(opt) => opt.run().await, Options::H2o(opt) => opt.run().await, Options::HJ(opt) => opt.run().await, Options::Imdb(opt) => Box::pin(opt.run()).await, Options::Nlj(opt) => opt.run().await, + Options::Smj(opt) => opt.run().await, + Options::SortPushdown(opt) => opt.run().await, Options::SortTpch(opt) => opt.run().await, Options::Tpch(opt) => Box::pin(opt.run()).await, - Options::TpchConvert(opt) => opt.run().await, + Options::Tpcds(opt) => Box::pin(opt.run()).await, } } diff --git a/benchmarks/src/bin/external_aggr.rs b/benchmarks/src/bin/external_aggr.rs index 46b6cc9a80b24..ee604ec7365a1 100644 --- a/benchmarks/src/bin/external_aggr.rs +++ b/benchmarks/src/bin/external_aggr.rs @@ -17,13 +17,13 @@ //! 
external_aggr binary entrypoint +use clap::{Args, Parser, Subcommand}; use datafusion::execution::memory_pool::GreedyMemoryPool; use datafusion::execution::memory_pool::MemoryPool; use std::collections::HashMap; use std::path::PathBuf; use std::sync::Arc; use std::sync::LazyLock; -use structopt::StructOpt; use arrow::record_batch::RecordBatch; use arrow::util::pretty; @@ -33,47 +33,53 @@ use datafusion::datasource::listing::{ }; use datafusion::datasource::{MemTable, TableProvider}; use datafusion::error::Result; +use datafusion::execution::SessionStateBuilder; use datafusion::execution::memory_pool::FairSpillPool; -use datafusion::execution::memory_pool::{human_readable_size, units}; use datafusion::execution::runtime_env::RuntimeEnvBuilder; -use datafusion::execution::SessionStateBuilder; use datafusion::physical_plan::display::DisplayableExecutionPlan; use datafusion::physical_plan::{collect, displayable}; use datafusion::prelude::*; use datafusion_benchmarks::util::{BenchmarkRun, CommonOpt, QueryResult}; use datafusion_common::instant::Instant; use datafusion_common::utils::get_available_parallelism; -use datafusion_common::{exec_err, DEFAULT_PARQUET_EXTENSION}; +use datafusion_common::{DEFAULT_PARQUET_EXTENSION, exec_err}; +use datafusion_common::{human_readable_size, units}; -#[derive(Debug, StructOpt)] -#[structopt( +#[derive(Debug, Parser)] +#[command( name = "datafusion-external-aggregation", about = "DataFusion external aggregation benchmark" )] +struct Cli { + #[command(subcommand)] + command: ExternalAggrOpt, +} + +#[derive(Debug, Subcommand)] enum ExternalAggrOpt { Benchmark(ExternalAggrConfig), } -#[derive(Debug, StructOpt)] +#[derive(Debug, Args)] struct ExternalAggrConfig { /// Query number. If not specified, runs all queries - #[structopt(short, long)] + #[arg(short, long)] query: Option, /// Common options - #[structopt(flatten)] + #[command(flatten)] common: CommonOpt, /// Path to data files (lineitem). Only parquet format is supported - #[structopt(parse(from_os_str), required = true, short = "p", long = "path")] + #[arg(required = true, short = 'p', long = "path")] path: PathBuf, /// Load the data into a MemTable before executing the query - #[structopt(short = "m", long = "mem-table")] + #[arg(short = 'm', long = "mem-table")] mem_table: bool, /// Path to JSON benchmark result to be compare using `compare.py` - #[structopt(parse(from_os_str), short = "o", long = "output")] + #[arg(short = 'o', long = "output")] output_path: Option, } @@ -338,7 +344,8 @@ impl ExternalAggrConfig { pub async fn main() -> Result<()> { env_logger::init(); - match ExternalAggrOpt::from_args() { + let cli = Cli::parse(); + match cli.command { ExternalAggrOpt::Benchmark(opt) => opt.run().await?, } diff --git a/benchmarks/src/bin/imdb.rs b/benchmarks/src/bin/imdb.rs index 5ce99928df662..e86735f87b8f1 100644 --- a/benchmarks/src/bin/imdb.rs +++ b/benchmarks/src/bin/imdb.rs @@ -17,9 +17,9 @@ //! 
IMDB binary entrypoint +use clap::{Parser, Subcommand}; use datafusion::error::Result; use datafusion_benchmarks::imdb; -use structopt::StructOpt; #[cfg(all(feature = "snmalloc", feature = "mimalloc"))] compile_error!( @@ -34,24 +34,30 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; -#[derive(Debug, StructOpt)] -#[structopt(about = "benchmark command")] -enum BenchmarkSubCommandOpt { - #[structopt(name = "datafusion")] - DataFusionBenchmark(imdb::RunOpt), +#[derive(Debug, Parser)] +#[command(name = "IMDB", about = "IMDB Dataset Processing.")] +struct Cli { + #[command(subcommand)] + command: ImdbOpt, } -#[derive(Debug, StructOpt)] -#[structopt(name = "IMDB", about = "IMDB Dataset Processing.")] +#[derive(Debug, Subcommand)] enum ImdbOpt { + #[command(subcommand)] Benchmark(BenchmarkSubCommandOpt), Convert(imdb::ConvertOpt), } +#[derive(Debug, Subcommand)] +enum BenchmarkSubCommandOpt { + #[command(name = "datafusion")] + DataFusionBenchmark(imdb::RunOpt), +} + #[tokio::main] pub async fn main() -> Result<()> { env_logger::init(); - match ImdbOpt::from_args() { + match Cli::parse().command { ImdbOpt::Benchmark(BenchmarkSubCommandOpt::DataFusionBenchmark(opt)) => { Box::pin(opt.run()).await } diff --git a/benchmarks/src/bin/mem_profile.rs b/benchmarks/src/bin/mem_profile.rs index 16fc3871bec86..41a0baecbba86 100644 --- a/benchmarks/src/bin/mem_profile.rs +++ b/benchmarks/src/bin/mem_profile.rs @@ -16,6 +16,7 @@ // under the License. //! mem_profile binary entrypoint +use clap::{Parser, Subcommand}; use datafusion::error::Result; use std::{ env, @@ -23,7 +24,6 @@ use std::{ path::Path, process::{Command, Stdio}, }; -use structopt::StructOpt; use datafusion_benchmarks::{ clickbench, @@ -31,19 +31,19 @@ use datafusion_benchmarks::{ imdb, sort_tpch, tpch, }; -#[derive(Debug, StructOpt)] -#[structopt(name = "Memory Profiling Utility")] -struct MemProfileOpt { +#[derive(Debug, Parser)] +#[command(name = "Memory Profiling Utility")] +struct Cli { /// Cargo profile to use in dfbench (e.g. release, release-nonlto) - #[structopt(long, default_value = "release")] + #[arg(long, default_value = "release")] bench_profile: String, - #[structopt(subcommand)] + #[command(subcommand)] command: Options, } -#[derive(Debug, StructOpt)] -#[structopt(about = "Benchmark command")] +#[derive(Debug, Subcommand)] +#[command(about = "Benchmark command")] enum Options { Clickbench(clickbench::RunOpt), H2o(h2o::RunOpt), @@ -55,9 +55,9 @@ enum Options { #[tokio::main] pub async fn main() -> Result<()> { // 1. Parse args and check which benchmarks should be run - let mem_profile_opt = MemProfileOpt::from_args(); - let profile = mem_profile_opt.bench_profile; - let query_range = match mem_profile_opt.command { + let cli = Cli::parse(); + let profile = cli.bench_profile; + let query_range = match cli.command { Options::Clickbench(opt) => { let entries = std::fs::read_dir(&opt.queries_path)? 
.filter_map(Result::ok) @@ -199,21 +199,18 @@ fn run_query(args: &[String], results: &mut Vec<QueryResult>) -> Result<()> { // Look for lines that contain execution time / memory stats while let Some(line) = iter.next() { - if let Some((query, duration_ms)) = parse_query_time(line) { - if let Some(next_line) = iter.peek() { - if let Some((peak_rss, peak_commit, page_faults)) = - parse_vm_line(next_line) - { - results.push(QueryResult { - query, - duration_ms, - peak_rss, - peak_commit, - page_faults, - }); - break; - } - } + if let Some((query, duration_ms)) = parse_query_time(line) + && let Some(next_line) = iter.peek() + && let Some((peak_rss, peak_commit, page_faults)) = parse_vm_line(next_line) + { + results.push(QueryResult { + query, + duration_ms, + peak_rss, + peak_commit, + page_faults, + }); + break; } }
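The mem_profile hunk above flattens three nested `if let` blocks into a single let-chain, a pattern stabilized with the Rust 2024 edition. A minimal self-contained sketch of the same shape; the parse helpers here are hypothetical stand-ins for `parse_query_time` and `parse_vm_line`, not the benchmark's own functions:

```rust
// Sketch of the let-chain pattern used in the hunk above (Rust 2024 edition).
fn first_pair(lines: &[&str]) -> Option<(u64, u64)> {
    let mut iter = lines.iter().peekable();
    while let Some(line) = iter.next() {
        // All three bindings must succeed for the body to run, replacing
        // three levels of nesting with one flat condition chain.
        if let Ok(a) = line.parse::<u64>()
            && let Some(next) = iter.peek()
            && let Ok(b) = next.parse::<u64>()
        {
            return Some((a, b));
        }
    }
    None
}

fn main() {
    // "x" fails to parse, so the chain first succeeds at ("1", peek "2").
    assert_eq!(first_pair(&["x", "1", "2"]), Some((1, 2)));
}
```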
diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs deleted file mode 100644 index ca2bb8e57c0ec..0000000000000 --- a/benchmarks/src/bin/tpch.rs +++ /dev/null @@ -1,65 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! tpch binary only entrypoint - -use datafusion::error::Result; -use datafusion_benchmarks::tpch; -use structopt::StructOpt; - -#[cfg(all(feature = "snmalloc", feature = "mimalloc"))] -compile_error!( - "feature \"snmalloc\" and feature \"mimalloc\" cannot be enabled at the same time" ); - -#[cfg(feature = "snmalloc")] -#[global_allocator] -static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; - -#[cfg(feature = "mimalloc")] -#[global_allocator] -static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; - -#[derive(Debug, StructOpt)] -#[structopt(about = "benchmark command")] -enum BenchmarkSubCommandOpt { - #[structopt(name = "datafusion")] - DataFusionBenchmark(tpch::RunOpt), -} - -#[derive(Debug, StructOpt)] -#[structopt(name = "TPC-H", about = "TPC-H Benchmarks.")] -enum TpchOpt { - Benchmark(BenchmarkSubCommandOpt), - Convert(tpch::ConvertOpt), -} - -/// 'tpch' entry point, with tortured command line arguments. Please -/// use `dbbench` instead. -/// -/// Note: this is kept to be backwards compatible with the benchmark names prior to -/// -#[tokio::main] -async fn main() -> Result<()> { - env_logger::init(); - match TpchOpt::from_args() { - TpchOpt::Benchmark(BenchmarkSubCommandOpt::DataFusionBenchmark(opt)) => { - Box::pin(opt.run()).await - } - TpchOpt::Convert(opt) => opt.run().await, - } -} diff --git a/benchmarks/src/cancellation.rs b/benchmarks/src/cancellation.rs index fcf03fbc54550..d3da1b0e83623 100644 --- a/benchmarks/src/cancellation.rs +++ b/benchmarks/src/cancellation.rs @@ -24,24 +24,24 @@ use crate::util::{BenchmarkRun, CommonOpt}; use arrow::array::Array; use arrow::datatypes::DataType; use arrow::record_batch::RecordBatch; +use clap::Args; use datafusion::common::{Result, ScalarValue}; -use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::file_format::FileFormat; +use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::{ListingOptions, ListingTableUrl}; -use datafusion::execution::object_store::ObjectStoreUrl; use datafusion::execution::TaskContext; -use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; +use datafusion::execution::object_store::ObjectStoreUrl; use datafusion::physical_plan::ExecutionPlan; +use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion::prelude::*; use datafusion_common::instant::Instant; use futures::TryStreamExt; use object_store::ObjectStore; -use parquet::arrow::async_writer::ParquetObjectWriter; use parquet::arrow::AsyncArrowWriter; +use parquet::arrow::async_writer::ParquetObjectWriter; +use rand::Rng; use rand::distr::Alphanumeric; use rand::rngs::ThreadRng; -use rand::Rng; -use structopt::StructOpt; use tokio::runtime::Runtime; use tokio_util::sync::CancellationToken; @@ -57,31 +57,31 @@ use tokio_util::sync::CancellationToken; /// The query is an anonymized version of a real-world query, and the /// test starts the query then cancels it and reports how long it takes /// for the runtime to fully exit. -#[derive(Debug, StructOpt, Clone)] -#[structopt(verbatim_doc_comment)] +#[derive(Debug, Args, Clone)] +#[command(verbatim_doc_comment)] pub struct RunOpt { /// Common options - #[structopt(flatten)] + #[command(flatten)] common: CommonOpt, /// Path to folder where data will be generated - #[structopt(parse(from_os_str), required = true, short = "p", long = "path")] + #[arg(required = true, short = 'p', long = "path")] path: PathBuf, /// Path to machine readable output file - #[structopt(parse(from_os_str), short = "o", long = "output")] + #[arg(short = 'o', long = "output")] output_path: Option<PathBuf>, /// Number of files to generate - #[structopt(long = "num-files", default_value = "7")] + #[arg(long = "num-files", default_value = "7")] num_files: usize, /// Number of rows per file to generate - #[structopt(long = "num-rows-per-file", default_value = "5000000")] + #[arg(long = "num-rows-per-file", default_value = "5000000")] num_rows_per_file: usize, /// How long to wait, in milliseconds, before attempting to cancel - #[structopt(long = "wait-time", default_value = "100")] + #[arg(long = "wait-time", default_value = "100")] wait_time: u64, }
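For intuition, the measurement described in the doc comment above can be sketched as follows. This is not the benchmark's actual code: `run_query_until_cancelled` is a hypothetical stand-in for the spawned query; only the `CancellationToken` wiring mirrors what the file imports:

```rust
use std::time::{Duration, Instant};
use tokio_util::sync::CancellationToken;

/// Hypothetical stand-in for the long-running query under test.
async fn run_query_until_cancelled() {
    tokio::time::sleep(Duration::from_secs(3600)).await;
}

async fn measure_cancellation(wait_time_ms: u64) -> Duration {
    let token = CancellationToken::new();
    let child = token.child_token();
    let handle = tokio::spawn(async move {
        tokio::select! {
            // Either the token fires or the query (never) completes.
            _ = child.cancelled() => {}
            _ = run_query_until_cancelled() => {}
        }
    });
    // Let the query get going before attempting to cancel it.
    tokio::time::sleep(Duration::from_millis(wait_time_ms)).await;
    let start = Instant::now();
    token.cancel();
    let _ = handle.await; // time until the task has fully exited
    start.elapsed()
}

#[tokio::main]
async fn main() {
    println!("shutdown took {:?}", measure_cancellation(100).await);
}
```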
diff --git a/benchmarks/src/clickbench.rs b/benchmarks/src/clickbench.rs index a550503390c54..70aaeb7d2d192 100644 --- a/benchmarks/src/clickbench.rs +++ b/benchmarks/src/clickbench.rs @@ -19,7 +19,8 @@ use std::fs; use std::io::ErrorKind; use std::path::{Path, PathBuf}; -use crate::util::{print_memory_stats, BenchmarkRun, CommonOpt, QueryResult}; +use crate::util::{BenchmarkRun, CommonOpt, QueryResult, print_memory_stats}; +use clap::Args; use datafusion::logical_expr::{ExplainFormat, ExplainOption}; use datafusion::{ error::{DataFusionError, Result}, @@ -27,7 +28,16 @@ use datafusion::{ }; use datafusion_common::exec_datafusion_err; use datafusion_common::instant::Instant; -use structopt::StructOpt; + +/// SQL to create the hits view with proper EventDate casting. +/// +/// ClickBench stores EventDate as UInt16 (days since 1970-01-01) for +/// storage efficiency (2 bytes vs 4-8 bytes for date types). +/// This view transforms it to SQL DATE type for query compatibility. +const HITS_VIEW_DDL: &str = r#"CREATE VIEW hits AS +SELECT * EXCEPT ("EventDate"), + CAST(CAST("EventDate" AS INTEGER) AS DATE) AS "EventDate" +FROM hits_raw"#; /// Driver program to run the ClickBench benchmark /// @@ -37,11 +47,11 @@ use structopt::StructOpt; /// /// [1]: https://github.com/ClickHouse/ClickBench /// [2]: https://github.com/ClickHouse/ClickBench/tree/main/datafusion -#[derive(Debug, StructOpt, Clone)] -#[structopt(verbatim_doc_comment)] +#[derive(Debug, Args, Clone)] +#[command(verbatim_doc_comment)] pub struct RunOpt { /// Query number (between 0 and 42). If not specified, runs all queries - #[structopt(short, long)] + #[arg(short, long)] pub query: Option<usize>, /// If specified, enables Parquet Filter Pushdown. 
@@ -49,35 +59,54 @@ pub struct RunOpt { /// Specifically, it enables: /// * `pushdown_filters = true` /// * `reorder_filters = true` - #[structopt(long = "pushdown")] + #[arg(long = "pushdown")] pushdown: bool, /// Common options - #[structopt(flatten)] + #[command(flatten)] common: CommonOpt, /// Path to hits.parquet (single file) or `hits_partitioned` /// (partitioned, 100 files) - #[structopt( - parse(from_os_str), - short = "p", + #[arg( + short = 'p', long = "path", default_value = "benchmarks/data/hits.parquet" )] path: PathBuf, /// Path to queries directory - #[structopt( - parse(from_os_str), - short = "r", + #[arg( + short = 'r', long = "queries-path", default_value = "benchmarks/queries/clickbench/queries" )] pub queries_path: PathBuf, /// If present, write results json here - #[structopt(parse(from_os_str), short = "o", long = "output")] + #[arg(short = 'o', long = "output")] output_path: Option<PathBuf>, + + /// Column name that the data is sorted by (e.g., "EventTime") + /// If specified, DataFusion will be informed that the data has this sort order + /// using CREATE EXTERNAL TABLE with a WITH ORDER clause. + /// + /// Recommended to use with: -c datafusion.optimizer.prefer_existing_sort=true + /// This allows DataFusion to optimize away redundant sorts while maintaining + /// multi-core parallelism for other operations. + #[arg(long = "sorted-by")] + sorted_by: Option<String>, + + /// Sort order: ASC or DESC (default: ASC) + #[arg(long = "sort-order", default_value = "ASC")] + sort_order: String, + + /// Configuration options in the format key=value + /// Can be specified multiple times. + /// + /// Example: -c datafusion.optimizer.prefer_existing_sort=true + #[arg(short = 'c', long = "config")] + config_options: Vec<String>, } /// Get the SQL file path @@ -125,6 +154,39 @@ impl RunOpt { // configure parquet options let mut config = self.common.config()?; + + if self.sorted_by.is_some() { + println!("ℹ️ Data is registered with sort order"); + + let has_prefer_sort = self + .config_options + .iter() + .any(|opt| opt.contains("prefer_existing_sort=true")); + + if !has_prefer_sort { + println!( + "ℹ️ Consider using -c datafusion.optimizer.prefer_existing_sort=true" + ); + println!("ℹ️ to optimize queries while maintaining parallelism"); + } + } + + // Apply user-provided configuration options + for config_opt in &self.config_options { + let parts: Vec<&str> = config_opt.splitn(2, '=').collect(); + if parts.len() != 2 { + return Err(exec_datafusion_err!( + "Invalid config option format: '{}'. Expected 'key=value'", + config_opt + )); + } + let key = parts[0]; + let value = parts[1]; + + println!("Setting config: {key} = {value}"); + config = config.set_str(key, value); + } +
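A note on the option parsing just above: `splitn(2, '=')` splits on the first `=` only, so configuration values may themselves contain `=`. A standalone illustration of that behavior; `parse_kv` is a hypothetical helper for this example, not part of the PR:

```rust
// Only the first '=' separates key from value; later '=' stay in the value.
fn parse_kv(opt: &str) -> Option<(&str, &str)> {
    let mut parts = opt.splitn(2, '=');
    match (parts.next(), parts.next()) {
        (Some(key), Some(value)) if !key.is_empty() => Some((key, value)),
        _ => None,
    }
}

fn main() {
    assert_eq!(
        parse_kv("datafusion.optimizer.prefer_existing_sort=true"),
        Some(("datafusion.optimizer.prefer_existing_sort", "true"))
    );
    // a value containing '=' survives intact
    assert_eq!(parse_kv("k=a=b"), Some(("k", "a=b")));
    // no separator at all is rejected
    assert_eq!(parse_kv("no-equals"), None);
}
```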
{ let parquet_options = &mut config.options_mut().execution.parquet; // The hits_partitioned dataset specifies string columns @@ -136,10 +198,18 @@ impl RunOpt { parquet_options.pushdown_filters = true; parquet_options.reorder_filters = true; } + + if self.sorted_by.is_some() { + // We want to compare the dynamic top-k optimization when data is sorted, + // so we assume filter pushdown is also enabled in this case. + parquet_options.pushdown_filters = true; + parquet_options.reorder_filters = true; + } } - let rt_builder = self.common.runtime_env_builder()?; - let ctx = SessionContext::new_with_config_rt(config, rt_builder.build_arc()?); + let rt = self.common.build_runtime()?; + let ctx = SessionContext::new_with_config_rt(config, rt); + self.register_hits(&ctx).await?; let mut benchmark_run = BenchmarkRun::new(); @@ -214,17 +284,68 @@ impl RunOpt { } /// Registers the `hits.parquet` as a table named `hits` + /// If sorted_by is specified, uses CREATE EXTERNAL TABLE with WITH ORDER async fn register_hits(&self, ctx: &SessionContext) -> Result<()> { - let options = Default::default(); let path = self.path.as_os_str().to_str().unwrap(); - ctx.register_parquet("hits", path, options) - .await - .map_err(|e| { - DataFusionError::Context( - format!("Registering 'hits' as {path}"), - Box::new(e), - ) - }) + + // If sorted_by is specified, use CREATE EXTERNAL TABLE with WITH ORDER + if let Some(ref sort_column) = self.sorted_by { + println!( + "Registering table with sort order: {} {}", + sort_column, self.sort_order + ); + + // Wrap the column name in double quotes unless it already contains them + let escaped_column = if sort_column.contains('"') { + sort_column.clone() + } else { + format!("\"{sort_column}\"") + }; + + // Build CREATE EXTERNAL TABLE DDL with WITH ORDER clause + // Schema will be automatically inferred from the Parquet file + let create_table_sql = format!( + "CREATE EXTERNAL TABLE hits_raw \ + STORED AS PARQUET \ + LOCATION '{}' \ + WITH ORDER ({} {})", + path, + escaped_column, + self.sort_order.to_uppercase() + ); + + println!("Executing: {create_table_sql}"); + + // Execute the CREATE EXTERNAL TABLE statement + ctx.sql(&create_table_sql).await?.collect().await?; + } else { + // Original registration without sort order + let options = Default::default(); + ctx.register_parquet("hits_raw", path, options) + .await + .map_err(|e| { + DataFusionError::Context( + format!("Registering 'hits_raw' as {path}"), + Box::new(e), + ) + })?; + } + + // Create the hits view with EventDate transformation + Self::create_hits_view(ctx).await + } + + /// Creates the hits view with EventDate transformation from UInt16 to DATE. + /// + /// ClickBench encodes EventDate as UInt16 days since epoch (1970-01-01). + async fn create_hits_view(ctx: &SessionContext) -> Result<()> { + ctx.sql(HITS_VIEW_DDL).await?.collect().await.map_err(|e| { + DataFusionError::Context( + "Creating 'hits' view with EventDate transformation".to_string(), + Box::new(e), + ) + })?; + Ok(()) } fn iterations(&self) -> usize { diff --git a/benchmarks/src/h2o.rs b/benchmarks/src/h2o.rs index be74252031194..8b6e04932cb39 100644 --- a/benchmarks/src/h2o.rs +++ b/benchmarks/src/h2o.rs @@ -20,31 +20,30 @@ //! - [H2O AI Benchmark](https://duckdb.org/2023/04/14/h2oai.html) //! 
- [Extended window function benchmark](https://duckdb.org/2024/06/26/benchmarks-over-time.html#window-functions-benchmark) -use crate::util::{print_memory_stats, BenchmarkRun, CommonOpt}; +use crate::util::{BenchmarkRun, CommonOpt, print_memory_stats}; +use clap::Args; use datafusion::logical_expr::{ExplainFormat, ExplainOption}; use datafusion::{error::Result, prelude::SessionContext}; use datafusion_common::{ - exec_datafusion_err, instant::Instant, internal_err, DataFusionError, TableReference, + DataFusionError, TableReference, exec_datafusion_err, instant::Instant, internal_err, }; use std::path::{Path, PathBuf}; -use structopt::StructOpt; /// Run the H2O benchmark -#[derive(Debug, StructOpt, Clone)] -#[structopt(verbatim_doc_comment)] +#[derive(Debug, Args, Clone)] +#[command(verbatim_doc_comment)] pub struct RunOpt { - #[structopt(short, long)] + #[arg(short, long)] pub query: Option<usize>, /// Common options - #[structopt(flatten)] + #[command(flatten)] common: CommonOpt, /// Path to queries.sql (single file) /// default value is the groupby.sql file in the h2o benchmark - #[structopt( - parse(from_os_str), - short = "r", + #[arg( + short = 'r', long = "queries-path", default_value = "benchmarks/queries/h2o/groupby.sql" )] @@ -53,9 +52,8 @@ pub struct RunOpt { /// Path to data file (parquet or csv) /// Default value is the G1_1e7_1e7_100_0.csv file in the h2o benchmark /// This is the small csv file with 10^7 rows - #[structopt( - parse(from_os_str), - short = "p", + #[arg( + short = 'p', long = "path", default_value = "benchmarks/data/h2o/G1_1e7_1e7_100_0.csv" )] @@ -64,15 +62,15 @@ pub struct RunOpt { /// Path to data files (parquet or csv), using , to separate the paths /// Default value is the small files for join x table, small table, medium table, big table files in the h2o benchmark /// This is the small csv file case - #[structopt( - short = "join-paths", + #[arg( + short = 'j', long = "join-paths", default_value = "benchmarks/data/h2o/J1_1e7_NA_0.csv,benchmarks/data/h2o/J1_1e7_1e1_0.csv,benchmarks/data/h2o/J1_1e7_1e4_0.csv,benchmarks/data/h2o/J1_1e7_1e7_NA.csv" )] join_paths: String, /// If present, write results json here - #[structopt(parse(from_os_str), short = "o", long = "output")] + #[arg(short = 'o', long = "output")] output_path: Option<PathBuf>, } @@ -86,8 +84,8 @@ impl RunOpt { }; let config = self.common.config()?; - let rt_builder = self.common.runtime_env_builder()?; - let ctx = SessionContext::new_with_config_rt(config, rt_builder.build_arc()?); + let rt = self.common.build_runtime()?; + let ctx = SessionContext::new_with_config_rt(config, rt); // Register tables depending on which h2o benchmark is being run // (groupby/join/window) diff --git a/benchmarks/src/hj.rs b/benchmarks/src/hj.rs index 505b322745485..301fe0d599cd6 100644 --- a/benchmarks/src/hj.rs +++ b/benchmarks/src/hj.rs @@ -16,11 +16,12 @@ // under the License. use crate::util::{BenchmarkRun, CommonOpt, QueryResult}; +use clap::Args; use datafusion::physical_plan::execute_stream; use datafusion::{error::Result, prelude::SessionContext}; use datafusion_common::instant::Instant; -use datafusion_common::{exec_datafusion_err, exec_err, DataFusionError}; -use structopt::StructOpt; +use datafusion_common::{DataFusionError, exec_datafusion_err, exec_err}; +use std::path::PathBuf; use futures::StreamExt; @@ -32,139 +33,276 @@ use futures::StreamExt; /// It uses simple equality predicates to ensure a hash join is selected. /// Where we vary selectivity, we do so with additional cheap predicates that /// do not change the join key (so the physical operator remains HashJoin). -#[derive(Debug, StructOpt, Clone)] -#[structopt(verbatim_doc_comment)] +#[derive(Debug, Args, Clone)] +#[command(verbatim_doc_comment)] pub struct RunOpt { - /// Query number (between 1 and 12). If not specified, runs all queries - #[structopt(short, long)] + /// Query number. If not specified, runs all queries + #[arg(short, long)] query: Option<usize>, /// Common options (iterations, batch size, target_partitions, etc.) - #[structopt(flatten)] + #[command(flatten)] common: CommonOpt, + /// Path to TPC-H SF10 data + #[arg(short = 'p', long = "path")] + path: Option<PathBuf>, + /// If present, write results json here - #[structopt(parse(from_os_str), short = "o", long = "output")] - output_path: Option<PathBuf>, + #[arg(short = 'o', long = "output")] + output_path: Option<PathBuf>, +} + +struct HashJoinQuery { + sql: &'static str, + density: f64, + prob_hit: f64, + build_size: &'static str, + probe_size: &'static str, +}
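The `density` and `prob_hit` fields just introduced drive how each case is labeled below: `density` is the fraction of the build-side key domain that is actually occupied, and `prob_hit` is the fraction of probe rows whose key finds a build-side match. As a rough model (a sketch assuming unique build-side keys, not benchmark code), the expected inner-join output is simply the probe row count scaled by the hit rate:

```rust
/// Rough model, assuming unique build-side keys: each probe row that
/// hits produces exactly one output row.
fn expected_inner_rows(probe_rows: u64, prob_hit: f64) -> u64 {
    (probe_rows as f64 * prob_hit) as u64
}

fn main() {
    // Q4 below: 60M probe rows at a 10% hit rate -> ~6M output rows.
    assert_eq!(expected_inner_rows(60_000_000, 0.1), 6_000_000);
}
```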
/// Inline SQL queries for Hash Join benchmarks -/// -/// Each query's comment includes: -/// - Left row count × Right row count -/// - Join predicate selectivity (approximate output fraction). -/// - Q11 and Q12 selectivity is relative to cartesian product while the others are -/// relative to probe side. -const HASH_QUERIES: &[&str] = &[ - // Q1: INNER 10 x 10K | LOW ~0.1% - // equality on key + cheap filter to downselect - r#" - SELECT t1.value, t2.value - FROM generate_series(0, 9000, 1000) AS t1(value) - JOIN range(10000) AS t2 - ON t1.value = t2.value; - "#, - // Q2: INNER 10 x 10K | LOW ~0.1% - r#" - SELECT t1.value, t2.value - FROM generate_series(0, 9000, 1000) AS t1 - JOIN range(10000) AS t2 - ON t1.value = t2.value - WHERE t1.value % 5 = 0 - "#, - // Q3: INNER 10K x 10K | HIGH ~90% - r#" - SELECT t1.value, t2.value - FROM range(10000) AS t1 - JOIN range(10000) AS t2 - ON t1.value = t2.value - WHERE t1.value % 10 <> 0 - "#, - // Q4: INNER 30 x 30K | LOW ~0.1% - r#" - SELECT t1.value, t2.value - FROM generate_series(0, 29000, 1000) AS t1 - JOIN range(30000) AS t2 - ON t1.value = t2.value - WHERE t1.value % 5 = 0 - "#, - // Q5: INNER 10 x 200K | VERY LOW ~0.005% (small to large) - r#" - SELECT t1.value, t2.value - FROM generate_series(0, 9000, 1000) AS t1 - JOIN range(200000) AS t2 - ON t1.value = t2.value - WHERE t1.value % 1000 = 0 - "#, - // Q6: INNER 200K x 10 | VERY LOW ~0.005% (large to small) - r#" - SELECT t1.value, t2.value - FROM range(200000) AS t1 - JOIN generate_series(0, 9000, 1000) AS t2 - ON t1.value = t2.value - WHERE t1.value % 1000 = 0 - "#, - // Q7: RIGHT OUTER 10 x 200K | LOW ~0.1% - // Outer join still uses HashJoin for equi-keys; the extra filter reduces matches - r#" - SELECT t1.value AS l, t2.value AS r - FROM generate_series(0, 9000, 1000) AS t1 - RIGHT JOIN range(200000) AS t2 - ON t1.value = t2.value - WHERE t2.value % 1000 = 0 - "#, - // Q8: LEFT OUTER 200K x 10 | LOW ~0.1% - r#" - SELECT t1.value AS l, t2.value AS r - FROM range(200000) AS t1 - LEFT JOIN generate_series(0, 9000, 1000) AS t2 - ON t1.value = t2.value - WHERE t1.value % 1000 = 0 - "#, - // Q9: FULL OUTER 30 x 30K | LOW ~0.1% - r#" - SELECT t1.value AS l, t2.value AS r - FROM generate_series(0, 29000, 1000) AS t1 - FULL JOIN range(30000) AS t2 - ON t1.value = t2.value - WHERE COALESCE(t1.value, t2.value) % 1000 = 0 - "#, - // Q10: FULL OUTER 30 x 30K | HIGH ~90% - r#" - SELECT t1.value AS l, t2.value AS r - FROM generate_series(0, 29000, 1000) AS t1 - 
FULL JOIN range(30000) AS t2 - ON t1.value = t2.value - WHERE COALESCE(t1.value, t2.value) % 10 <> 0 - "#, - // Q11: INNER 30 x 30K | MEDIUM ~50% | cheap predicate on parity - r#" - SELECT t1.value, t2.value - FROM generate_series(0, 29000, 1000) AS t1 - INNER JOIN range(30000) AS t2 - ON (t1.value % 2) = (t2.value % 2) - "#, - // Q12: FULL OUTER 30 x 30K | MEDIUM ~50% | expression key - r#" - SELECT t1.value AS l, t2.value AS r - FROM generate_series(0, 29000, 1000) AS t1 - FULL JOIN range(30000) AS t2 - ON (t1.value % 2) = (t2.value % 2) - "#, - // Q13: INNER 30 x 30K | LOW 0.1% | modulo with adding values - r#" - SELECT t1.value, t2.value - FROM generate_series(0, 29000, 1000) AS t1 - INNER JOIN range(30000) AS t2 - ON (t1.value = t2.value) AND ((t1.value + t2.value) % 10 < 1) - "#, - // Q14: FULL OUTER 30 x 30K | ALL ~100% | modulo - r#" - SELECT t1.value AS l, t2.value AS r - FROM generate_series(0, 29000, 1000) AS t1 - FULL JOIN range(30000) AS t2 - ON (t1.value = t2.value) AND ((t1.value + t2.value) % 10 = 0) - "#, +const HASH_QUERIES: &[HashJoinQuery] = &[ + // Q1: Very Small Build Side (Dense) + // Build Side: nation (25 rows) | Probe Side: customer (1.5M rows) + HashJoinQuery { + sql: r###"SELECT n_nationkey FROM nation JOIN customer ON c_nationkey = n_nationkey"###, + density: 1.0, + prob_hit: 1.0, + build_size: "25", + probe_size: "1.5M", + }, + // Q2: Very Small Build Side (Sparse, range < 1024) + // Build Side: nation (25 rows, range 961) | Probe Side: customer (1.5M rows) + HashJoinQuery { + sql: r###"SELECT l.k + FROM ( + SELECT c_nationkey * 40 as k + FROM customer + ) l + JOIN ( + SELECT n_nationkey * 40 as k FROM nation + ) s ON l.k = s.k"###, + density: 0.026, + prob_hit: 1.0, + build_size: "25", + probe_size: "1.5M", + }, + // Q3: 100% Density, 100% Hit rate + HashJoinQuery { + sql: r###"SELECT s_suppkey FROM supplier JOIN lineitem ON s_suppkey = l_suppkey"###, + density: 1.0, + prob_hit: 1.0, + build_size: "100K", + probe_size: "60M", + }, + // Q4: 100% Density, 10% Hit rate + HashJoinQuery { + sql: r###"SELECT l.k + FROM ( + SELECT CASE WHEN l_suppkey % 10 = 0 THEN l_suppkey ELSE l_suppkey + 1000000 END as k + FROM lineitem + ) l + JOIN ( + SELECT s_suppkey as k FROM supplier + ) s ON l.k = s.k"###, + density: 1.0, + prob_hit: 0.1, + build_size: "100K", + probe_size: "60M", + }, + // Q5: 75% Density, 100% Hit rate + HashJoinQuery { + sql: r###"SELECT l.k + FROM ( + SELECT l_suppkey * 4 / 3 as k + FROM lineitem + ) l + JOIN ( + SELECT s_suppkey * 4 / 3 as k FROM supplier + ) s ON l.k = s.k"###, + density: 0.75, + prob_hit: 1.0, + build_size: "100K", + probe_size: "60M", + }, + // Q6: 75% Density, 10% Hit rate + HashJoinQuery { + sql: r###"SELECT l.k + FROM ( + SELECT CASE + WHEN l_suppkey % 10 = 0 THEN l_suppkey * 4 / 3 + WHEN l_suppkey % 10 < 9 THEN (l_suppkey * 4 / 3 / 4) * 4 + 3 + ELSE l_suppkey * 4 / 3 + 1000000 + END as k + FROM lineitem + ) l + JOIN ( + SELECT s_suppkey * 4 / 3 as k FROM supplier + ) s ON l.k = s.k"###, + density: 0.75, + prob_hit: 0.1, + build_size: "100K", + probe_size: "60M", + }, + // Q7: 50% Density, 100% Hit rate + HashJoinQuery { + sql: r###"SELECT l.k + FROM ( + SELECT l_suppkey * 2 as k + FROM lineitem + ) l + JOIN ( + SELECT s_suppkey * 2 as k FROM supplier + ) s ON l.k = s.k"###, + density: 0.5, + prob_hit: 1.0, + build_size: "100K", + probe_size: "60M", + }, + // Q8: 50% Density, 10% Hit rate + HashJoinQuery { + sql: r###"SELECT l.k + FROM ( + SELECT CASE + WHEN l_suppkey % 10 = 0 THEN l_suppkey * 2 + WHEN l_suppkey % 10 < 9 THEN 
l_suppkey * 2 + 1 + ELSE l_suppkey * 2 + 1000000 + END as k + FROM lineitem + ) l + JOIN ( + SELECT s_suppkey * 2 as k FROM supplier + ) s ON l.k = s.k"###, + density: 0.5, + prob_hit: 0.1, + build_size: "100K", + probe_size: "60M", + }, + // Q9: 20% Density, 100% Hit rate + HashJoinQuery { + sql: r###"SELECT l.k + FROM ( + SELECT l_suppkey * 5 as k + FROM lineitem + ) l + JOIN ( + SELECT s_suppkey * 5 as k FROM supplier + ) s ON l.k = s.k"###, + density: 0.2, + prob_hit: 1.0, + build_size: "100K", + probe_size: "60M", + }, + // Q10: 20% Density, 10% Hit rate + HashJoinQuery { + sql: r###"SELECT l.k + FROM ( + SELECT CASE + WHEN l_suppkey % 10 = 0 THEN l_suppkey * 5 + WHEN l_suppkey % 10 < 9 THEN l_suppkey * 5 + 1 + ELSE l_suppkey * 5 + 1000000 + END as k + FROM lineitem + ) l + JOIN ( + SELECT s_suppkey * 5 as k FROM supplier + ) s ON l.k = s.k"###, + density: 0.2, + prob_hit: 0.1, + build_size: "100K", + probe_size: "60M", + }, + // Q11: 10% Density, 100% Hit rate + HashJoinQuery { + sql: r###"SELECT l.k + FROM ( + SELECT l_suppkey * 10 as k + FROM lineitem + ) l + JOIN ( + SELECT s_suppkey * 10 as k FROM supplier + ) s ON l.k = s.k"###, + density: 0.1, + prob_hit: 1.0, + build_size: "100K", + probe_size: "60M", + }, + // Q12: 10% Density, 10% Hit rate + HashJoinQuery { + sql: r###"SELECT l.k + FROM ( + SELECT CASE + WHEN l_suppkey % 10 = 0 THEN l_suppkey * 10 + WHEN l_suppkey % 10 < 9 THEN l_suppkey * 10 + 1 + ELSE l_suppkey * 10 + 1000000 + END as k + FROM lineitem + ) l + JOIN ( + SELECT s_suppkey * 10 as k FROM supplier + ) s ON l.k = s.k"###, + density: 0.1, + prob_hit: 0.1, + build_size: "100K", + probe_size: "60M", + }, + // Q13: 1% Density, 100% Hit rate + HashJoinQuery { + sql: r###"SELECT l.k + FROM ( + SELECT l_suppkey * 100 as k + FROM lineitem + ) l + JOIN ( + SELECT s_suppkey * 100 as k FROM supplier + ) s ON l.k = s.k"###, + density: 0.01, + prob_hit: 1.0, + build_size: "100K", + probe_size: "60M", + }, + // Q14: 1% Density, 10% Hit rate + HashJoinQuery { + sql: r###"SELECT l.k + FROM ( + SELECT CASE + WHEN l_suppkey % 10 = 0 THEN l_suppkey * 100 + WHEN l_suppkey % 10 < 9 THEN l_suppkey * 100 + 1 + ELSE l_suppkey * 100 + 11000000 -- oob + END as k + FROM lineitem + ) l + JOIN ( + SELECT s_suppkey * 100 as k FROM supplier + ) s ON l.k = s.k"###, + density: 0.01, + prob_hit: 0.1, + build_size: "100K", + probe_size: "60M", + }, + // Q15: 20% Density, 10% Hit rate, 20% Duplicates in Build Side + HashJoinQuery { + sql: r###"SELECT l.k + FROM ( + SELECT CASE + WHEN l_suppkey % 10 = 0 THEN ((l_suppkey % 80000) + 1) * 25 / 4 + ELSE ((l_suppkey % 80000) + 1) * 25 / 4 + 1 + END as k + FROM lineitem + ) l + JOIN ( + SELECT CASE + WHEN s_suppkey <= 80000 THEN (s_suppkey * 25) / 4 + ELSE ((s_suppkey - 80000) * 25) / 4 + END as k + FROM supplier + ) s ON l.k = s.k"###, + density: 0.2, + prob_hit: 0.1, + build_size: "100K_(20%_dups)", + probe_size: "60M", + }, ]; impl RunOpt { @@ -186,17 +324,47 @@ impl RunOpt { }; let config = self.common.config()?; - let rt_builder = self.common.runtime_env_builder()?; - let ctx = SessionContext::new_with_config_rt(config, rt_builder.build_arc()?); + let rt = self.common.build_runtime()?; + let ctx = SessionContext::new_with_config_rt(config, rt); + + if let Some(path) = &self.path { + for table in &["lineitem", "supplier", "nation", "customer"] { + let table_path = path.join(table); + if !table_path.exists() { + return exec_err!( + "TPC-H table {} not found at {:?}", + table, + table_path + ); + } + ctx.register_parquet( + *table, + 
table_path.to_str().unwrap(), + Default::default(), + ) + .await?; + } + } let mut benchmark_run = BenchmarkRun::new(); for query_id in query_range { let query_index = query_id - 1; - let sql = HASH_QUERIES[query_index]; + let query = &HASH_QUERIES[query_index]; + + let case_name = format!( + "Query {}_density={}_prob_hit={}_{}*{}", + query_id, + query.density, + query.prob_hit, + query.build_size, + query.probe_size + ); + benchmark_run.start_new_case(&case_name); - benchmark_run.start_new_case(&format!("Query {query_id}")); - let query_run = self.benchmark_query(sql, &query_id.to_string(), &ctx).await; + let query_run = self + .benchmark_query(query.sql, &query_id.to_string(), &ctx) + .await; match query_run { Ok(query_results) => { for iter in query_results { diff --git a/benchmarks/src/imdb/convert.rs b/benchmarks/src/imdb/convert.rs index e7949aa715c23..aaed186da4905 100644 --- a/benchmarks/src/imdb/convert.rs +++ b/benchmarks/src/imdb/convert.rs @@ -20,31 +20,31 @@ use datafusion::logical_expr::select_expr::SelectExpr; use datafusion_common::instant::Instant; use std::path::PathBuf; +use clap::Args; use datafusion::error::Result; use datafusion::prelude::*; -use structopt::StructOpt; use datafusion::common::not_impl_err; -use super::get_imdb_table_schema; use super::IMDB_TABLES; +use super::get_imdb_table_schema; -#[derive(Debug, StructOpt)] +#[derive(Debug, Args)] pub struct ConvertOpt { /// Path to csv files - #[structopt(parse(from_os_str), required = true, short = "i", long = "input")] + #[arg(required = true, short = 'i', long = "input")] input_path: PathBuf, /// Output path - #[structopt(parse(from_os_str), required = true, short = "o", long = "output")] + #[arg(required = true, short = 'o', long = "output")] output_path: PathBuf, /// Output file format: `csv` or `parquet` - #[structopt(short = "f", long = "format")] + #[arg(short = 'f', long = "format")] file_format: String, /// Batch size when reading CSV or Parquet files - #[structopt(short = "s", long = "batch-size", default_value = "8192")] + #[arg(short = 's', long = "batch-size", default_value = "8192")] batch_size: usize, } diff --git a/benchmarks/src/imdb/run.rs b/benchmarks/src/imdb/run.rs index 11bd424ba6866..ca9710a920517 100644 --- a/benchmarks/src/imdb/run.rs +++ b/benchmarks/src/imdb/run.rs @@ -19,16 +19,16 @@ use std::path::PathBuf; use std::sync::Arc; use super::{ - get_imdb_table_schema, get_query_sql, IMDB_QUERY_END_ID, IMDB_QUERY_START_ID, - IMDB_TABLES, + IMDB_QUERY_END_ID, IMDB_QUERY_START_ID, IMDB_TABLES, get_imdb_table_schema, + get_query_sql, }; -use crate::util::{print_memory_stats, BenchmarkRun, CommonOpt, QueryResult}; +use crate::util::{BenchmarkRun, CommonOpt, QueryResult, print_memory_stats}; use arrow::record_batch::RecordBatch; use arrow::util::pretty::{self, pretty_format_batches}; +use datafusion::datasource::file_format::FileFormat; use datafusion::datasource::file_format::csv::CsvFormat; use datafusion::datasource::file_format::parquet::ParquetFormat; -use datafusion::datasource::file_format::FileFormat; use datafusion::datasource::listing::{ ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl, }; @@ -41,8 +41,8 @@ use datafusion_common::instant::Instant; use datafusion_common::utils::get_available_parallelism; use datafusion_common::{DEFAULT_CSV_EXTENSION, DEFAULT_PARQUET_EXTENSION}; +use clap::Args; use log::info; -use structopt::StructOpt; // hack to avoid `default_value is meaningless for bool` errors type BoolDefaultTrue = bool; @@ -57,41 +57,45 @@ type BoolDefaultTrue = 
bool; /// [2]: https://event.cwi.nl/da/job/imdb.tgz /// [3]: https://db.in.tum.de/~leis/qo/job.tgz -#[derive(Debug, StructOpt, Clone)] -#[structopt(verbatim_doc_comment)] +#[derive(Debug, Args, Clone)] +#[command(verbatim_doc_comment)] pub struct RunOpt { /// Query number. If not specified, runs all queries - #[structopt(short, long)] + #[arg(short, long)] pub query: Option, /// Common options - #[structopt(flatten)] + #[command(flatten)] common: CommonOpt, /// Path to data files - #[structopt(parse(from_os_str), required = true, short = "p", long = "path")] + #[arg(required = true, short = 'p', long = "path")] path: PathBuf, /// File format: `csv` or `parquet` - #[structopt(short = "f", long = "format", default_value = "csv")] + #[arg(short = 'f', long = "format", default_value = "csv")] file_format: String, /// Load the data into a MemTable before executing the query - #[structopt(short = "m", long = "mem-table")] + #[arg(short = 'm', long = "mem-table")] mem_table: bool, /// Path to machine readable output file - #[structopt(parse(from_os_str), short = "o", long = "output")] + #[arg(short = 'o', long = "output")] output_path: Option, /// Whether to disable collection of statistics (and cost based optimizations) or not. - #[structopt(short = "S", long = "disable-statistics")] + #[arg(short = 'S', long = "disable-statistics")] disable_statistics: bool, /// If true then hash join used, if false then sort merge join /// True by default. - #[structopt(short = "j", long = "prefer_hash_join", default_value = "true")] + #[arg(short = 'j', long = "prefer_hash_join", default_value = "true")] prefer_hash_join: BoolDefaultTrue, + + /// How many bytes to buffer on the probe side of hash joins. + #[arg(long, default_value = "0")] + hash_join_buffering_capacity: usize, } fn map_query_id_to_str(query_id: usize) -> &'static str { @@ -306,8 +310,10 @@ impl RunOpt { .config()? 
.with_collect_statistics(!self.disable_statistics); config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join; - let rt_builder = self.common.runtime_env_builder()?; - let ctx = SessionContext::new_with_config_rt(config, rt_builder.build_arc()?); + config.options_mut().execution.hash_join_buffering_capacity = + self.hash_join_buffering_capacity; + let rt = self.common.build_runtime()?; + let ctx = SessionContext::new_with_config_rt(config, rt); // register tables self.register_tables(&ctx).await?; @@ -517,6 +523,7 @@ mod tests { memory_limit: None, sort_spill_reservation_bytes: None, debug: false, + simulate_latency: false, }; let opt = RunOpt { query: Some(query), @@ -527,6 +534,7 @@ mod tests { output_path: None, disable_statistics: false, prefer_hash_join: true, + hash_join_buffering_capacity: 0, }; opt.register_tables(&ctx).await?; let queries = get_query_sql(map_query_id_to_str(query))?; @@ -553,6 +561,7 @@ mod tests { memory_limit: None, sort_spill_reservation_bytes: None, debug: false, + simulate_latency: false, }; let opt = RunOpt { query: Some(query), @@ -563,6 +572,7 @@ mod tests { output_path: None, disable_statistics: false, prefer_hash_join: true, + hash_join_buffering_capacity: 0, }; opt.register_tables(&ctx).await?; let queries = get_query_sql(map_query_id_to_str(query))?; diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs index 07cffa5ae468e..7c598e65d824c 100644 --- a/benchmarks/src/lib.rs +++ b/benchmarks/src/lib.rs @@ -22,6 +22,10 @@ pub mod h2o; pub mod hj; pub mod imdb; pub mod nlj; +pub mod smj; +pub mod sort_pushdown; pub mod sort_tpch; +pub mod sql_benchmark; +pub mod tpcds; pub mod tpch; pub mod util; diff --git a/benchmarks/src/nlj.rs b/benchmarks/src/nlj.rs index 7d1e14f69439c..361cc35ec200c 100644 --- a/benchmarks/src/nlj.rs +++ b/benchmarks/src/nlj.rs @@ -16,11 +16,11 @@ // under the License. use crate::util::{BenchmarkRun, CommonOpt, QueryResult}; +use clap::Args; use datafusion::physical_plan::execute_stream; use datafusion::{error::Result, prelude::SessionContext}; use datafusion_common::instant::Instant; -use datafusion_common::{exec_datafusion_err, exec_err, DataFusionError}; -use structopt::StructOpt; +use datafusion_common::{DataFusionError, exec_datafusion_err, exec_err}; use futures::StreamExt; @@ -40,19 +40,19 @@ use futures::StreamExt; /// - Input size: Different combinations of left (build) side and right (probe) /// side sizes /// - Selectivity of join filters -#[derive(Debug, StructOpt, Clone)] -#[structopt(verbatim_doc_comment)] +#[derive(Debug, Args, Clone)] +#[command(verbatim_doc_comment)] pub struct RunOpt { /// Query number (between 1 and 10). 
If not specified, runs all queries - #[structopt(short, long)] + #[arg(short, long)] query: Option, /// Common options - #[structopt(flatten)] + #[command(flatten)] common: CommonOpt, /// If present, write results json here - #[structopt(parse(from_os_str), short = "o", long = "output")] + #[arg(short = 'o', long = "output")] output_path: Option, } @@ -207,8 +207,8 @@ impl RunOpt { }; let config = self.common.config()?; - let rt_builder = self.common.runtime_env_builder()?; - let ctx = SessionContext::new_with_config_rt(config, rt_builder.build_arc()?); + let rt = self.common.build_runtime()?; + let ctx = SessionContext::new_with_config_rt(config, rt); let mut benchmark_run = BenchmarkRun::new(); for query_id in query_range { @@ -268,8 +268,8 @@ impl RunOpt { let elapsed = start.elapsed(); println!( - "Query {query_name} iteration {i} returned {row_count} rows in {elapsed:?}" - ); + "Query {query_name} iteration {i} returned {row_count} rows in {elapsed:?}" + ); query_results.push(QueryResult { elapsed, row_count }); } diff --git a/benchmarks/src/smj.rs b/benchmarks/src/smj.rs new file mode 100644 index 0000000000000..3d173b7116e2b --- /dev/null +++ b/benchmarks/src/smj.rs @@ -0,0 +1,647 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::util::{BenchmarkRun, CommonOpt, QueryResult}; +use clap::Args; +use datafusion::physical_plan::execute_stream; +use datafusion::{error::Result, prelude::SessionContext}; +use datafusion_common::instant::Instant; +use datafusion_common::{DataFusionError, exec_datafusion_err, exec_err}; + +use futures::StreamExt; + +/// Run the Sort Merge Join (SMJ) benchmark +/// +/// This micro-benchmark focuses on the performance characteristics of SMJs. +/// +/// It uses equality join predicates (to ensure SMJ is selected) and varies: +/// - Join type: Inner/Left/Right/Full/LeftSemi/LeftAnti/RightSemi/RightAnti +/// - Key cardinality: 1:1, 1:N, N:M relationships +/// - Filter selectivity: Low (1%), Medium (10%), High (50%) +/// - Input sizes: Small to large, balanced and skewed +/// +/// All inputs are pre-sorted in CTEs before the join to isolate join +/// performance from sort overhead. +#[derive(Debug, Args, Clone)] +#[command(verbatim_doc_comment)] +pub struct RunOpt { + /// Query number (between 1 and 26). 
If not specified, runs all queries + #[arg(short, long)] + query: Option<usize>, + + /// Common options + #[command(flatten)] + common: CommonOpt, + + /// If present, write results json here + #[arg(short = 'o', long = "output")] + output_path: Option<PathBuf>, +} + +/// Inline SQL queries for SMJ benchmarks +/// +/// Each query's comment includes: +/// - Join type +/// - Left row count × Right row count +/// - Key cardinality (rows per key) +/// - Filter selectivity (if applicable) +const SMJ_QUERIES: &[&str] = &[ + // Q1: INNER 1M x 1M | 1:1 + r#" + WITH t1_sorted AS ( + SELECT value as key FROM range(1000000) ORDER BY value + ), + t2_sorted AS ( + SELECT value as key FROM range(1000000) ORDER BY value + ) + SELECT t1_sorted.key as k1, t2_sorted.key as k2 + FROM t1_sorted JOIN t2_sorted ON t1_sorted.key = t2_sorted.key + "#, + // Q2: INNER 1M x 10M | 1:10 + r#" + WITH t1_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(10000000) + ORDER BY key, data + ) + SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2 + FROM t1_sorted JOIN t2_sorted ON t1_sorted.key = t2_sorted.key + "#, + // Q3: INNER 1M x 1M | 1:100 + r#" + WITH t1_sorted AS ( + SELECT value % 10000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 10000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ) + SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2 + FROM t1_sorted JOIN t2_sorted ON t1_sorted.key = t2_sorted.key + "#, + // Q4: INNER 1M x 10M | 1:10 | 1% + r#" + WITH t1_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(10000000) + ORDER BY key, data + ) + SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2 + FROM t1_sorted JOIN t2_sorted ON t1_sorted.key = t2_sorted.key + WHERE t2_sorted.data % 100 = 0 + "#, + // Q5: INNER 1M x 1M | 1:100 | 10% + r#" + WITH t1_sorted AS ( + SELECT value % 10000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 10000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ) + SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2 + FROM t1_sorted JOIN t2_sorted ON t1_sorted.key = t2_sorted.key + WHERE t1_sorted.data <> t2_sorted.data AND t2_sorted.data % 10 = 0 + "#, + // Q6: LEFT 1M x 10M | 1:10 + r#" + WITH t1_sorted AS ( + SELECT value % 105000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(10000000) + ORDER BY key, data + ) + SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2 + FROM t1_sorted LEFT JOIN t2_sorted ON t1_sorted.key = t2_sorted.key + "#, + // Q7: LEFT 1M x 10M | 1:10 | 50% + r#" + WITH t1_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(10000000) + ORDER BY key, data + ) + SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2 + FROM t1_sorted LEFT JOIN t2_sorted ON t1_sorted.key = t2_sorted.key + WHERE t2_sorted.data IS NULL OR t2_sorted.data % 2 = 0 + "#, + // Q8: FULL 1M x 1M | 1:10 + r#" + WITH t1_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(1000000) + ORDER BY key,
data + ), + t2_sorted AS ( + SELECT value % 125000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ) + SELECT t1_sorted.key as k1, t1_sorted.data as d1, + t2_sorted.key as k2, t2_sorted.data as d2 + FROM t1_sorted FULL JOIN t2_sorted ON t1_sorted.key = t2_sorted.key + "#, + // Q9: FULL 1M x 10M | 1:10 | 10% + r#" + WITH t1_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(10000000) + ORDER BY key, data + ) + SELECT t1_sorted.key as k1, t1_sorted.data as d1, + t2_sorted.key as k2, t2_sorted.data as d2 + FROM t1_sorted FULL JOIN t2_sorted ON t1_sorted.key = t2_sorted.key + WHERE (t1_sorted.data IS NULL OR t2_sorted.data IS NULL + OR t1_sorted.data <> t2_sorted.data) + AND (t1_sorted.data IS NULL OR t1_sorted.data % 10 = 0) + "#, + // Q10: LEFT SEMI 1M x 10M | 1:10 + r#" + WITH t1_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 100000 as key + FROM range(10000000) + ORDER BY key + ) + SELECT t1_sorted.key, t1_sorted.data + FROM t1_sorted + WHERE EXISTS ( + SELECT 1 FROM t2_sorted + WHERE t2_sorted.key = t1_sorted.key + ) + "#, + // Q11: LEFT SEMI 1M x 10M | 1:10 | 1% + r#" + WITH t1_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(10000000) + ORDER BY key, data + ) + SELECT t1_sorted.key, t1_sorted.data + FROM t1_sorted + WHERE EXISTS ( + SELECT 1 FROM t2_sorted + WHERE t2_sorted.key = t1_sorted.key + AND t2_sorted.data <> t1_sorted.data + AND t2_sorted.data % 100 = 0 + ) + "#, + // Q12: LEFT SEMI 1M x 10M | 1:10 | 50% + r#" + WITH t1_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(10000000) + ORDER BY key, data + ) + SELECT t1_sorted.key, t1_sorted.data + FROM t1_sorted + WHERE EXISTS ( + SELECT 1 FROM t2_sorted + WHERE t2_sorted.key = t1_sorted.key + AND t2_sorted.data <> t1_sorted.data + AND t2_sorted.data % 2 = 0 + ) + "#, + // Q13: LEFT SEMI 1M x 10M | 1:10 | 90% + r#" + WITH t1_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(10000000) + ORDER BY key, data + ) + SELECT t1_sorted.key, t1_sorted.data + FROM t1_sorted + WHERE EXISTS ( + SELECT 1 FROM t2_sorted + WHERE t2_sorted.key = t1_sorted.key + AND t2_sorted.data <> t1_sorted.data + AND t2_sorted.data % 10 <> 0 + ) + "#, + // Q14: LEFT ANTI 1M x 10M | 1:10 + r#" + WITH t1_sorted AS ( + SELECT value % 105000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 100000 as key + FROM range(10000000) + ORDER BY key + ) + SELECT t1_sorted.key, t1_sorted.data + FROM t1_sorted + WHERE NOT EXISTS ( + SELECT 1 FROM t2_sorted + WHERE t2_sorted.key = t1_sorted.key + ) + "#, + // Q15: LEFT ANTI 1M x 10M | 1:10 | partial match + r#" + WITH t1_sorted AS ( + SELECT value % 120000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 100000 as key + FROM range(10000000) + ORDER BY key + ) + SELECT t1_sorted.key, t1_sorted.data + FROM t1_sorted + WHERE NOT EXISTS ( + SELECT 1 FROM t2_sorted + WHERE t2_sorted.key = 
t1_sorted.key + ) + "#, + // Q16: LEFT ANTI 1M x 1M | 1:1 | stress + r#" + WITH t1_sorted AS ( + SELECT value % 110000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 100000 as key + FROM range(1000000) + ORDER BY key + ) + SELECT t1_sorted.key, t1_sorted.data + FROM t1_sorted + WHERE NOT EXISTS ( + SELECT 1 FROM t2_sorted + WHERE t2_sorted.key = t1_sorted.key + ) + "#, + // Q17: INNER 1M x 50M | 1:50 | 5% + r#" + WITH t1_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(50000000) + ORDER BY key, data + ) + SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2 + FROM t1_sorted JOIN t2_sorted ON t1_sorted.key = t2_sorted.key + WHERE t2_sorted.data <> t1_sorted.data AND t2_sorted.data % 20 = 0 + "#, + // Q18: LEFT SEMI 1M x 50M | 1:50 | 2% + r#" + WITH t1_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(50000000) + ORDER BY key, data + ) + SELECT t1_sorted.key, t1_sorted.data + FROM t1_sorted + WHERE EXISTS ( + SELECT 1 FROM t2_sorted + WHERE t2_sorted.key = t1_sorted.key + AND t2_sorted.data <> t1_sorted.data + AND t2_sorted.data % 50 = 0 + ) + "#, + // Q19: LEFT ANTI 1M x 50M | 1:50 | partial match + r#" + WITH t1_sorted AS ( + SELECT value % 150000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 100000 as key + FROM range(50000000) + ORDER BY key + ) + SELECT t1_sorted.key, t1_sorted.data + FROM t1_sorted + WHERE NOT EXISTS ( + SELECT 1 FROM t2_sorted + WHERE t2_sorted.key = t1_sorted.key + ) + "#, + // Q20: INNER 1M x 10M | 1:100 + GROUP BY + r#" + WITH t1_sorted AS ( + SELECT value % 10000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 10000 as key, value as data + FROM range(10000000) + ORDER BY key, data + ) + SELECT t1_sorted.key, count(*) as cnt + FROM t1_sorted JOIN t2_sorted ON t1_sorted.key = t2_sorted.key + GROUP BY t1_sorted.key + "#, + // Q21: INNER 10M x 10M | unique keys (1:1) | 50% join filter + r#" + WITH t1_sorted AS ( + SELECT value as key, value as data + FROM range(10000000) ORDER BY value + ), + t2_sorted AS ( + SELECT value as key, value as data + FROM range(10000000) ORDER BY value + ) + SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2 + FROM t1_sorted JOIN t2_sorted + ON t1_sorted.key = t2_sorted.key + AND t1_sorted.data + t2_sorted.data < 10000000 + "#, + // Q22: LEFT 10M x 10M | unique keys (1:1) | 50% join filter + r#" + WITH t1_sorted AS ( + SELECT value as key, value as data + FROM range(10000000) ORDER BY value + ), + t2_sorted AS ( + SELECT value as key, value as data + FROM range(10000000) ORDER BY value + ) + SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2 + FROM t1_sorted LEFT JOIN t2_sorted + ON t1_sorted.key = t2_sorted.key + AND t1_sorted.data + t2_sorted.data < 10000000 + "#, + // Q23: FULL 10M x 10M | unique keys (1:1) | 50% join filter + r#" + WITH t1_sorted AS ( + SELECT value as key, value as data + FROM range(10000000) ORDER BY value + ), + t2_sorted AS ( + SELECT value as key, value as data + FROM range(10000000) ORDER BY value + ) + SELECT t1_sorted.key as k1, t1_sorted.data as d1, + t2_sorted.key as k2, t2_sorted.data as d2 + FROM t1_sorted FULL JOIN t2_sorted + ON 
t1_sorted.key = t2_sorted.key + AND t1_sorted.data + t2_sorted.data < 10000000 + "#, + // Q24: LEFT MARK 1M x 10M | 1:10 | 1% + r#" + WITH t1_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(10000000) + ORDER BY key, data + ) + SELECT t1_sorted.key, t1_sorted.data + FROM t1_sorted + WHERE t1_sorted.data < 0 + OR EXISTS ( + SELECT 1 FROM t2_sorted + WHERE t2_sorted.key = t1_sorted.key + AND t2_sorted.data <> t1_sorted.data + AND t2_sorted.data % 100 = 0 + ) + "#, + // Q25: LEFT MARK 1M x 10M | 1:10 | 50% + r#" + WITH t1_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(10000000) + ORDER BY key, data + ) + SELECT t1_sorted.key, t1_sorted.data + FROM t1_sorted + WHERE t1_sorted.data < 0 + OR EXISTS ( + SELECT 1 FROM t2_sorted + WHERE t2_sorted.key = t1_sorted.key + AND t2_sorted.data <> t1_sorted.data + AND t2_sorted.data % 2 = 0 + ) + "#, + // Q26: LEFT MARK 1M x 10M | 1:10 | 90% + r#" + WITH t1_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(10000000) + ORDER BY key, data + ) + SELECT t1_sorted.key, t1_sorted.data + FROM t1_sorted + WHERE t1_sorted.data < 0 + OR EXISTS ( + SELECT 1 FROM t2_sorted + WHERE t2_sorted.key = t1_sorted.key + AND t2_sorted.data <> t1_sorted.data + AND t2_sorted.data % 10 <> 0 + ) + "#, +]; + +impl RunOpt { + pub async fn run(self) -> Result<()> { + println!("Running SMJ benchmarks with the following options: {self:#?}\n"); + + // Define query range + let query_range = match self.query { + Some(query_id) => { + if query_id >= 1 && query_id <= SMJ_QUERIES.len() { + query_id..=query_id + } else { + return exec_err!( + "Query {query_id} not found. 
Available queries: 1 to {}", + SMJ_QUERIES.len() + ); + } + } + None => 1..=SMJ_QUERIES.len(), + }; + + let mut config = self.common.config()?; + // Disable hash joins to force SMJ + config = config.set_bool("datafusion.optimizer.prefer_hash_join", false); + let rt = self.common.build_runtime()?; + let ctx = SessionContext::new_with_config_rt(config, rt); + + let mut benchmark_run = BenchmarkRun::new(); + for query_id in query_range { + let query_index = query_id - 1; // Convert 1-based to 0-based index + + let sql = SMJ_QUERIES[query_index]; + benchmark_run.start_new_case(&format!("Query {query_id}")); + let expect_mark = query_id >= 24; + let query_run = self + .benchmark_query(sql, &query_id.to_string(), expect_mark, &ctx) + .await; + match query_run { + Ok(query_results) => { + for iter in query_results { + benchmark_run.write_iter(iter.elapsed, iter.row_count); + } + } + Err(e) => { + return Err(DataFusionError::Context( + format!("SMJ benchmark Q{query_id} failed with error:"), + Box::new(e), + )); + } + } + } + + benchmark_run.maybe_write_json(self.output_path.as_ref())?; + Ok(()) + } + + async fn benchmark_query( + &self, + sql: &str, + query_name: &str, + expect_mark: bool, + ctx: &SessionContext, + ) -> Result<Vec<QueryResult>> { + let mut query_results = vec![]; + + // Validate that the query plan includes a Sort Merge Join + let df = ctx.sql(sql).await?; + let physical_plan = df.create_physical_plan().await?; + let plan_string = format!("{physical_plan:#?}"); + + if !plan_string.contains("SortMergeJoinExec") { + return Err(exec_datafusion_err!( + "Query {query_name} does not use Sort Merge Join. Physical plan: {plan_string}" + )); + } + + if expect_mark && !plan_string.contains("LeftMark") { + return Err(exec_datafusion_err!( + "Query {query_name} expected LeftMark join. Physical plan: {plan_string}" + )); + } + + for i in 0..self.common.iterations { + let start = Instant::now(); + + let row_count = Self::execute_sql_without_result_buffering(sql, ctx).await?; + + let elapsed = start.elapsed(); + + println!( + "Query {query_name} iteration {i} returned {row_count} rows in {elapsed:?}" + ); + + query_results.push(QueryResult { elapsed, row_count }); + } + + Ok(query_results) + } + + /// Executes the SQL query and drops each result batch after evaluation, to + /// minimize memory usage by not buffering results. + /// + /// Returns the total result row count + async fn execute_sql_without_result_buffering( + sql: &str, + ctx: &SessionContext, + ) -> Result<usize> { + let mut row_count = 0; + + let df = ctx.sql(sql).await?; + let physical_plan = df.create_physical_plan().await?; + let mut stream = execute_stream(physical_plan, ctx.task_ctx())?; + + while let Some(batch) = stream.next().await { + row_count += batch?.num_rows(); + + // Count the rows, then drop the batch immediately; nothing is + // buffered, which keeps memory pressure low + } + + Ok(row_count) + } +} diff --git a/benchmarks/src/sort_pushdown.rs b/benchmarks/src/sort_pushdown.rs new file mode 100644 index 0000000000000..e7fce1921e7a8 --- /dev/null +++ b/benchmarks/src/sort_pushdown.rs @@ -0,0 +1,282 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Benchmark for sort pushdown optimization.
+//!
+//! Tests performance of sort elimination when files are non-overlapping and
+//! internally sorted (declared via `--sorted` / `WITH ORDER`).
+//!
+//! Queries are loaded from external SQL files under `queries/sort_pushdown/`
+//! so they can also be run directly with `datafusion-cli`.
+//!
+//! # Usage
+//!
+//! ```text
+//! # Prepare sorted TPCH lineitem data (SF=1)
+//! ./bench.sh data sort_pushdown
+//!
+//! # Baseline (no WITH ORDER, full SortExec)
+//! ./bench.sh run sort_pushdown
+//!
+//! # With sort elimination (WITH ORDER, SortExec removed)
+//! ./bench.sh run sort_pushdown_sorted
+//! ```
+
+use clap::Args;
+use futures::StreamExt;
+use std::path::PathBuf;
+use std::sync::Arc;
+
+use datafusion::datasource::TableProvider;
+use datafusion::datasource::file_format::parquet::ParquetFormat;
+use datafusion::datasource::listing::{
+ ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl,
+};
+use datafusion::error::Result;
+use datafusion::execution::SessionStateBuilder;
+use datafusion::physical_plan::display::DisplayableExecutionPlan;
+use datafusion::physical_plan::{displayable, execute_stream};
+use datafusion::prelude::*;
+use datafusion_common::DEFAULT_PARQUET_EXTENSION;
+use datafusion_common::instant::Instant;
+
+use crate::util::{BenchmarkRun, CommonOpt, QueryResult, print_memory_stats};
+
+/// Default path to query files, relative to the benchmark root
+const SORT_PUSHDOWN_QUERY_DIR: &str = "queries/sort_pushdown";
+
+#[derive(Debug, Args)]
+pub struct RunOpt {
+ /// Common options
+ #[command(flatten)]
+ common: CommonOpt,
+
+ /// Sort pushdown query number (1-4). If not specified, runs all queries
+ #[arg(short, long)]
+ pub query: Option<usize>,
+
+ /// Path to data files (lineitem). Only parquet format is supported.
+ #[arg(required = true, short = 'p', long = "path")]
+ path: PathBuf,
+
+ /// Path to JSON benchmark result to be compared using `compare.py`
+ #[arg(short = 'o', long = "output")]
+ output_path: Option<PathBuf>,
+
+ /// Path to directory containing query SQL files (q1.sql, q2.sql, ...).
+ /// Defaults to `queries/sort_pushdown/` relative to current directory.
+ #[arg(long = "queries-path")]
+ queries_path: Option<PathBuf>,
+
+ /// Mark the first column (l_orderkey) as sorted via WITH ORDER.
+ /// When set, enables sort elimination for matching queries.
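+ ///
+ /// With `--sorted`, the lineitem listing table is registered with a
+ /// `with_file_sort_order` on its first column (see `get_table` below),
+ /// which is what allows the optimizer to remove the `SortExec`.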
+ #[arg(short = 't', long = "sorted")]
+ sorted: bool,
+}
+
+impl RunOpt {
+ const TABLES: [&'static str; 1] = ["lineitem"];
+
+ fn queries_dir(&self) -> PathBuf {
+ self.queries_path
+ .clone()
+ .unwrap_or_else(|| PathBuf::from(SORT_PUSHDOWN_QUERY_DIR))
+ }
+
+ fn load_query(&self, query_id: usize) -> Result<String> {
+ let path = self.queries_dir().join(format!("q{query_id}.sql"));
+ std::fs::read_to_string(&path).map_err(|e| {
+ datafusion_common::DataFusionError::Execution(format!(
+ "Failed to read query file {}: {e}",
+ path.display()
+ ))
+ })
+ }
+
+ fn available_queries(&self) -> Vec<usize> {
+ let dir = self.queries_dir();
+ let mut ids = Vec::new();
+ if let Ok(entries) = std::fs::read_dir(&dir) {
+ for entry in entries.flatten() {
+ let name = entry.file_name();
+ let name = name.to_string_lossy();
+ if let Some(rest) = name.strip_prefix('q')
+ && let Some(num_str) = rest.strip_suffix(".sql")
+ && let Ok(id) = num_str.parse::<usize>()
+ {
+ ids.push(id);
+ }
+ }
+ }
+ ids.sort();
+ ids
+ }
+
+ pub async fn run(&self) -> Result<()> {
+ let mut benchmark_run = BenchmarkRun::new();
+
+ let query_ids = match self.query {
+ Some(query_id) => vec![query_id],
+ None => self.available_queries(),
+ };
+
+ for query_id in query_ids {
+ benchmark_run.start_new_case(&format!("{query_id}"));
+
+ let query_results = self.benchmark_query(query_id).await;
+ match query_results {
+ Ok(query_results) => {
+ for iter in query_results {
+ benchmark_run.write_iter(iter.elapsed, iter.row_count);
+ }
+ }
+ Err(e) => {
+ benchmark_run.mark_failed();
+ eprintln!("Query {query_id} failed: {e}");
+ }
+ }
+ }
+
+ benchmark_run.maybe_write_json(self.output_path.as_ref())?;
+ benchmark_run.maybe_print_failures();
+ Ok(())
+ }
+
+ async fn benchmark_query(&self, query_id: usize) -> Result<Vec<QueryResult>> {
+ let sql = self.load_query(query_id)?;
+
+ let config = self.common.config()?;
+ let rt = self.common.build_runtime()?;
+ let state = SessionStateBuilder::new()
+ .with_config(config)
+ .with_runtime_env(rt)
+ .with_default_features()
+ .build();
+ let ctx = SessionContext::from(state);
+
+ self.register_tables(&ctx).await?;
+
+ let mut millis = vec![];
+ let mut query_results = vec![];
+ for i in 0..self.iterations() {
+ let start = Instant::now();
+
+ let row_count = self.execute_query(&ctx, sql.as_str()).await?;
+
+ let elapsed = start.elapsed();
+ let ms = elapsed.as_secs_f64() * 1000.0;
+ millis.push(ms);
+
+ println!(
+ "Query {query_id} iteration {i} took {ms:.1} ms and returned {row_count} rows"
+ );
+ query_results.push(QueryResult { elapsed, row_count });
+ }
+
+ let avg = millis.iter().sum::<f64>() / millis.len() as f64;
+ println!("Query {query_id} avg time: {avg:.2} ms");
+
+ print_memory_stats();
+
+ Ok(query_results)
+ }
+
+ async fn register_tables(&self, ctx: &SessionContext) -> Result<()> {
+ for table in Self::TABLES {
+ let table_provider = self.get_table(ctx, table).await?;
+ ctx.register_table(table, table_provider)?;
+ }
+ Ok(())
+ }
+
+ async fn execute_query(&self, ctx: &SessionContext, sql: &str) -> Result<usize> {
+ let debug = self.common.debug;
+ let plan = ctx.sql(sql).await?;
+ let (state, plan) = plan.into_parts();
+
+ if debug {
+ println!("=== Logical plan ===\n{plan}\n");
+ }
+
+ let plan = state.optimize(&plan)?;
+ if debug {
+ println!("=== Optimized logical plan ===\n{plan}\n");
+ }
+ let physical_plan = state.create_physical_plan(&plan).await?;
+ if debug {
+ println!(
+ "=== Physical plan ===\n{}\n",
+ displayable(physical_plan.as_ref()).indent(true)
+ );
+ }
+
+ let mut row_count = 0;
+ let mut stream = execute_stream(physical_plan.clone(), state.task_ctx())?;
+ while let Some(batch) = stream.next().await {
+ row_count += batch?.num_rows();
+ }
+
+ if debug {
+ println!(
+ "=== Physical plan with metrics ===\n{}\n",
+ DisplayableExecutionPlan::with_metrics(physical_plan.as_ref())
+ .indent(true)
+ );
+ }
+
+ Ok(row_count)
+ }
+
+ async fn get_table(
+ &self,
+ ctx: &SessionContext,
+ table: &str,
+ ) -> Result<Arc<dyn TableProvider>> {
+ let path = self.path.to_str().unwrap();
+ let state = ctx.state();
+ let path = format!("{path}/{table}");
+ let format = Arc::new(
+ ParquetFormat::default()
+ .with_options(ctx.state().table_options().parquet.clone()),
+ );
+ let extension = DEFAULT_PARQUET_EXTENSION;
+
+ let options = ListingOptions::new(format)
+ .with_file_extension(extension)
+ .with_collect_stat(true); // Always collect statistics for sort pushdown
+
+ let table_path = ListingTableUrl::parse(path)?;
+ let schema = options.infer_schema(&state, &table_path).await?;
+ let options = if self.sorted {
+ let key_column_name = schema.fields()[0].name();
+ options
+ .with_file_sort_order(vec![vec![col(key_column_name).sort(true, false)]])
+ } else {
+ options
+ };
+
+ let config = ListingTableConfig::new(table_path)
+ .with_listing_options(options)
+ .with_schema(schema);
+
+ Ok(Arc::new(ListingTable::try_new(config)?))
+ }
+
+ fn iterations(&self) -> usize {
+ self.common.iterations
+ }
+}
diff --git a/benchmarks/src/sort_tpch.rs b/benchmarks/src/sort_tpch.rs
index 09b5a676bbff1..95c90d826de20 100644
--- a/benchmarks/src/sort_tpch.rs
+++ b/benchmarks/src/sort_tpch.rs
@@ -21,10 +21,10 @@
 //! Another `Sort` benchmark focus on single core execution. This benchmark
 //! runs end-to-end sort queries and test the performance on multiple CPU cores.
 
+use clap::Args;
 use futures::StreamExt;
 use std::path::PathBuf;
 use std::sync::Arc;
-use structopt::StructOpt;
 
 use datafusion::datasource::file_format::parquet::ParquetFormat;
 use datafusion::datasource::listing::{
@@ -36,41 +36,41 @@ use datafusion::execution::SessionStateBuilder;
 use datafusion::physical_plan::display::DisplayableExecutionPlan;
 use datafusion::physical_plan::{displayable, execute_stream};
 use datafusion::prelude::*;
+use datafusion_common::DEFAULT_PARQUET_EXTENSION;
 use datafusion_common::instant::Instant;
 use datafusion_common::utils::get_available_parallelism;
-use datafusion_common::DEFAULT_PARQUET_EXTENSION;
 
-use crate::util::{print_memory_stats, BenchmarkRun, CommonOpt, QueryResult};
+use crate::util::{BenchmarkRun, CommonOpt, QueryResult, print_memory_stats};
 
-#[derive(Debug, StructOpt)]
+#[derive(Debug, Args)]
 pub struct RunOpt {
     /// Common options
-    #[structopt(flatten)]
+    #[command(flatten)]
     common: CommonOpt,
 
     /// Sort query number. If not specified, runs all queries
-    #[structopt(short, long)]
+    #[arg(short, long)]
     pub query: Option<usize>,
 
     /// Path to data files (lineitem). Only parquet format is supported
-    #[structopt(parse(from_os_str), required = true, short = "p", long = "path")]
+    #[arg(required = true, short = 'p', long = "path")]
     path: PathBuf,
 
     /// Path to JSON benchmark result to be compare using `compare.py`
-    #[structopt(parse(from_os_str), short = "o", long = "output")]
+    #[arg(short = 'o', long = "output")]
     output_path: Option<PathBuf>,
 
     /// Load the data into a MemTable before executing the query
-    #[structopt(short = "m", long = "mem-table")]
+    #[arg(short = 'm', long = "mem-table")]
     mem_table: bool,
 
     /// Mark the first column of each table as sorted in ascending order.
     /// The tables should have been created with the `--sort` option for this to have any effect.
-    #[structopt(short = "t", long = "sorted")]
+    #[arg(short = 't', long = "sorted")]
     sorted: bool,
 
     /// Append a `LIMIT n` clause to the query
-    #[structopt(short = "l", long = "limit")]
+    #[arg(short = 'l', long = "limit")]
     limit: Option<usize>,
 }
 
@@ -209,10 +209,10 @@ impl RunOpt {
     /// Benchmark query `query_id` in `SORT_QUERIES`
     async fn benchmark_query(&self, query_id: usize) -> Result<Vec<QueryResult>> {
         let config = self.common.config()?;
-        let rt_builder = self.common.runtime_env_builder()?;
+        let rt = self.common.build_runtime()?;
         let state = SessionStateBuilder::new()
             .with_config(config)
-            .with_runtime_env(rt_builder.build_arc()?)
+            .with_runtime_env(rt)
             .with_default_features()
             .build();
         let ctx = SessionContext::from(state);
diff --git a/benchmarks/src/sql_benchmark.rs b/benchmarks/src/sql_benchmark.rs
new file mode 100644
index 0000000000000..34614b132483f
--- /dev/null
+++ b/benchmarks/src/sql_benchmark.rs
@@ -0,0 +1,3538 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{Array, RecordBatch};
+use arrow::datatypes::*;
+use arrow::error::ArrowError;
+use arrow::util::display::{ArrayFormatter, FormatOptions};
+use datafusion::dataframe::DataFrameWriteOptions;
+use datafusion::datasource::MemTable;
+use datafusion::physical_plan::execute_stream;
+use datafusion::prelude::{CsvReadOptions, DataFrame, SessionContext};
+use datafusion_common::config::CsvOptions;
+use datafusion_common::{DataFusionError, Result, exec_datafusion_err};
+use futures::StreamExt;
+use log::{debug, info, trace, warn};
+use regex::Regex;
+use std::collections::HashMap;
+use std::fmt::Debug;
+use std::fs::{self, File, OpenOptions};
+use std::io::{BufRead, BufReader};
+use std::path::{Path, PathBuf};
+use std::sync::{Arc, LazyLock};
+
+/// A collection of benchmark configurations and state used by the DataFusion
+/// sql test harness. Each benchmark is defined by a file that can contain
+/// directives such as `load`, `run`, `assert`, `result`, etc. The
+/// `SqlBenchmark` struct holds the parsed data from that file and
+/// the impl provides methods to run, assert, persist, verify, and clean up
+/// benchmark results.
+#[derive(Debug, Clone)]
+pub struct SqlBenchmark {
+ /// Human-readable name of the benchmark.
+ name: String,
+ /// Top-level group name (derived from the file path or defined in a benchmark).
+ group: String,
+ /// Subgroup name, often a logical grouping.
+ subgroup: String,
+ /// Full path to the benchmark file.
+ benchmark_path: PathBuf,
+ /// Mapping of placeholder keys to concrete values (e.g. `"BENCHMARK_DIR"`).
+ replacement_mapping: HashMap<String, String>,
+ /// Expected strings that must appear in the physical plan of the queries.
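+ /// Populated by the `expect_plan` directive (e.g. `expect_plan SortMergeJoinExec`);
+ /// each entry must appear as a substring of the formatted physical plan,
+ /// as checked in `validate_expected_plan`.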
+ expect: Vec<String>,
+ /// All SQL queries grouped by directive (`load`, `run`, etc.).
+ queries: HashMap<QueryDirective, Vec<String>>,
+ /// Queries whose results are persisted to disk for later comparison.
+ result_queries: Vec<BenchmarkQuery>,
+ /// Queries whose results are asserted against an expected table.
+ assert_queries: Vec<BenchmarkQuery>,
+ /// Flag indicating whether the benchmark has been fully loaded
+ is_loaded: bool,
+ /// Stores the last run results if needed so they can be compared or persisted.
+ last_results: Option<Vec<RecordBatch>>,
+ /// echo statements
+ echo: Vec<String>,
+}
+
+impl SqlBenchmark {
+ pub async fn new(
+ ctx: &SessionContext,
+ full_path: impl AsRef<Path>,
+ benchmark_directory: impl AsRef<Path>,
+ ) -> Result<Self> {
+ let full_path = full_path.as_ref();
+ let benchmark_directory = benchmark_directory.as_ref();
+ let group_name = parse_group_from_path(full_path, benchmark_directory);
+ let mut bm = Self {
+ name: String::new(),
+ group: group_name,
+ subgroup: String::new(),
+ benchmark_path: full_path.to_path_buf(),
+ replacement_mapping: HashMap::new(),
+ expect: vec![],
+ queries: HashMap::new(),
+ result_queries: vec![],
+ assert_queries: vec![],
+ is_loaded: false,
+ last_results: None,
+ echo: vec![],
+ };
+ insert_replacement(
+ &mut bm.replacement_mapping,
+ "BENCHMARK_DIR",
+ benchmark_directory.to_string_lossy().into_owned(),
+ );
+
+ let path = bm.benchmark_path.clone();
+ bm.process_file(ctx, &path).await?;
+
+ Ok(bm)
+ }
+
+ /// Initializes the benchmark by executing `load` and `init` queries.
+ ///
+ /// Registers any required tables or sets up state in the provided
+ /// `SessionContext` before running queries. This method is idempotent:
+ /// calling it multiple times on the same instance returns
+ /// immediately after the first successful initialization.
+ ///
+ /// # Errors
+ /// Returns an error if any `load` or `init` query fails, or if the
+ /// benchmark file does not contain a `run` query.
+ pub async fn initialize(&mut self, ctx: &SessionContext) -> Result<()> {
+ if self.is_loaded {
+ return Ok(());
+ }
+
+ let path = self.benchmark_path.to_string_lossy().into_owned();
+
+ // validate there was a run query
+ if !self.queries.contains_key(&QueryDirective::Run) {
+ return Err(exec_datafusion_err!(
+ "Invalid benchmark file: no \"run\" query specified: {path}"
+ ));
+ }
+
+ // display any echoes
+ self.echo.iter().for_each(|txt| println!("{txt}"));
+
+ let load_queries = self.queries.get(&QueryDirective::Load);
+
+ if let Some(queries) = load_queries {
+ for query in queries {
+ debug!("Executing load query {query}");
+ ctx.sql(query).await?.collect().await?;
+ }
+ }
+
+ let init_queries = self.queries.get(&QueryDirective::Init);
+
+ if let Some(queries) = init_queries {
+ for query in queries {
+ debug!("Executing init query {query}");
+ ctx.sql(query).await?.collect().await?;
+ }
+ }
+
+ self.is_loaded = true;
+
+ Ok(())
+ }
+
+ /// Executes the `assert` queries and compares actual results against
+ /// expected values.
+ ///
+ /// Each `assert` query must be followed by a result table (separated by
+ /// `----`) in the benchmark file. The assertion passes only if the
+ /// returned record batches exactly match the expected rows.
+ ///
+ /// # Errors
+ /// Returns an error if any `assert` query fails, or if the actual and
+ /// expected results differ in row count or cell values.
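+ ///
+ /// Example benchmark snippet (illustrative; `II` declares two columns):
+ ///
+ /// ```text
+ /// assert II
+ /// SELECT 1, 'one'
+ /// ----
+ /// 1|one
+ /// ```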
+ pub async fn assert(&mut self, ctx: &SessionContext) -> Result<()> {
+ info!("Running assertions...");
+
+ for assert_query in &self.assert_queries {
+ let query = &assert_query.query;
+
+ info!("Executing assert query {query}");
+
+ let result = ctx.sql(query).await?.collect().await?;
+ let formatted_actual_results = format_record_batches(&result)?;
+
+ Self::compare_results(
+ assert_query,
+ &formatted_actual_results,
+ &assert_query.expected_result,
+ )?;
+ }
+
+ Ok(())
+ }
+
+ /// Executes the `run` queries, optionally saving results for later
+ /// verification. If there are multiple queries only the results for
+ /// the last query are saved.
+ ///
+ /// When `save_results` is `true`, it collects `SELECT`/`WITH` query
+ /// results and stores them in `last_results`.
+ ///
+ /// When `save_results` is `false`, it streams results and counts rows
+ /// without buffering them.
+ ///
+ /// If an 'expect' string is defined this method also validates that
+ /// the physical plan contains that string.
+ ///
+ /// # Errors
+ /// Returns an error if a `run` query fails or if expected plan strings
+ /// are not found.
+ pub async fn run(&mut self, ctx: &SessionContext, save_results: bool) -> Result<()> {
+ let run_queries = self
+ .queries
+ .get(&QueryDirective::Run)
+ .ok_or_else(|| exec_datafusion_err!("Run query should be loaded by now"))?;
+
+ let mut result_count = 0;
+
+ let result: Vec<RecordBatch> = {
+ let mut local_result = vec![];
+
+ for query in run_queries {
+ match save_results {
+ true => {
+ debug!(
+ "Running query (saving results) {}-{}: {query}",
+ self.group, self.subgroup
+ );
+
+ let df = ctx.sql(query).await?;
+ if !self.expect.is_empty() {
+ let physical_plan = df.create_physical_plan().await?;
+ self.validate_expected_plan(&physical_plan)?;
+ }
+
+ let result_schema = Arc::new(df.schema().as_arrow().clone());
+ let mut batches = df.collect().await?;
+ let trimmed = query.trim_start();
+
+ // save the output for select/with queries
+ if starts_with_ignore_ascii_case(trimmed, "select")
+ || starts_with_ignore_ascii_case(trimmed, "with")
+ {
+ if batches.is_empty() {
+ batches.push(RecordBatch::new_empty(result_schema));
+ }
+ let row_count_for_query =
+ batches.iter().map(RecordBatch::num_rows).sum::<usize>();
+ debug!(
+ "Persisting {} batches ({} rows)...",
+ batches.len(),
+ row_count_for_query
+ );
+
+ result_count = row_count_for_query;
+ local_result = batches;
+ }
+ }
+ false => {
+ debug!(
+ "Running query (ignoring results) {}-{}: {query}",
+ self.group, self.subgroup
+ );
+
+ result_count = self
+ .execute_sql_without_result_buffering(query, ctx)
+ .await?;
+ }
+ }
+ }
+
+ Ok::<Vec<RecordBatch>, DataFusionError>(local_result)
+ }?;
+
+ debug!("Results have {result_count} rows");
+
+ // Store results for verification
+ self.last_results = Some(result);
+
+ Ok(())
+ }
+
+ /// Calls run and persists results to disk as a CSV file.
+ ///
+ /// Requires that the benchmark defines a `result` or `result_query`.
+ /// Registers the results in a memory table and writes them to disk with
+ /// pipe delimiters and a header row.
+ ///
+ /// # Errors
+ /// Returns an error if no results are available or if writing to the
+ /// target path fails.
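+ ///
+ /// The persisted file can later be consumed by `verify` (via
+ /// `read_query_from_file`), which reads it back with the same `|`
+ /// delimiter and header settings.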
+ pub async fn persist(&mut self, ctx: &SessionContext) -> Result<()> { + self.run(ctx, true).await?; + + // Check if we have result queries to persist for + if self.result_queries.is_empty() { + info!("No result paths to persist"); + return Ok(()); + } + + let results = self + .last_results + .as_ref() + .expect("run should store last_results after successful execution"); + + let query = &self.result_queries[0]; + let path = query.path.as_ref().ok_or_else(|| { + exec_datafusion_err!( + "Unable to persist results from query '{}', no result specified", + query.query + ) + })?; + + info!("Persisting results for query to {path}"); + + let first_batch = results + .first() + .ok_or_else(|| exec_datafusion_err!("Results should be loaded"))?; + + let schema = first_batch.schema(); + let provider = MemTable::try_new(schema, vec![results.clone()])?; + + ctx.register_table("persist_data", Arc::new(provider))?; + + let df = ctx.table("persist_data").await?; + df.write_csv( + path, + DataFrameWriteOptions::new(), + Some( + CsvOptions::default() + .with_delimiter(b'|') + .with_has_header(true), + ), + ) + .await?; + + ctx.deregister_table("persist_data")?; + + Ok(()) + } + + /// Verifies persisted results against expected values. + /// + /// Executes the `result_query` or uses the stored last run results, then + /// compares actual output rows to the expected values defined in the + /// benchmark file. + /// + /// # Errors + /// Returns an error if no results are available or if the actual and + /// expected results differ in count or content. + pub async fn verify(&mut self, ctx: &SessionContext) -> Result<()> { + // Check if we have result queries to verify + if self.result_queries.is_empty() { + return Ok(()); + } + + if self.last_results.is_none() { + return Err(exec_datafusion_err!( + "No results available for verification. Run the benchmark first." + )); + } + + info!("Verifying results..."); + + self.load_expected_result_files(ctx).await?; + + // Get the first result query (assuming only one for now) + let query = &self.result_queries[0]; + let formatted_actual_results = if !query.query.trim().is_empty() { + let results = ctx.sql(&query.query).await?.collect().await?; + format_record_batches(&results) + } else { + let actual_results = self + .last_results + .as_ref() + .expect("last_results should be present after successful run"); + format_record_batches(actual_results) + }?; + + Self::compare_results(query, &formatted_actual_results, &query.expected_result) + } + + /// Runs `cleanup` queries to reset state after the benchmark run. 
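+ ///
+ /// A benchmark that `load`s a table would typically pair it with a
+ /// matching `cleanup` block, e.g. `DROP TABLE t;`.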
+ pub async fn cleanup(&mut self, ctx: &SessionContext) -> Result<()> {
+ info!("Running cleanup...");
+
+ let cleanup_queries = self.queries.get(&QueryDirective::Cleanup);
+
+ if let Some(queries) = cleanup_queries {
+ for query in queries {
+ let _ = ctx.sql(query).await?.collect().await?;
+ }
+ }
+
+ Ok(())
+ }
+
+ async fn load_expected_result_files(&mut self, ctx: &SessionContext) -> Result<()> {
+ for query in &mut self.result_queries {
+ if query.query.trim().is_empty() {
+ let Some(path) = query.path.clone() else {
+ continue;
+ };
+
+ let loaded_query =
+ read_query_from_file(ctx, path, &HashMap::new()).await?;
+ query.column_count = loaded_query.column_count;
+ query.expected_result = loaded_query.expected_result;
+ }
+ }
+
+ Ok(())
+ }
+
+ fn compare_results(
+ query: &BenchmarkQuery,
+ actual_results: &[Vec<String>],
+ expected_results: &[Vec<String>],
+ ) -> Result<()> {
+ if actual_results.is_empty() && expected_results.is_empty() {
+ return Ok(());
+ }
+
+ // Compare row count
+ if actual_results.len() != expected_results.len() {
+ return Err(exec_datafusion_err!(
+ "Error in result: expected {} rows but got {} for query {}",
+ expected_results.len(),
+ actual_results.len(),
+ query.query
+ ));
+ }
+
+ // Compare values
+ let zipped = actual_results
+ .iter()
+ .enumerate()
+ .zip(expected_results.iter());
+
+ for ((row_idx, actual), expected) in zipped {
+ trace!(
+ "row {}\nactual: {actual:?}\nexpected: {expected:?}",
+ row_idx + 1
+ );
+
+ // Compare column count
+ if actual.len() != expected.len() {
+ return Err(exec_datafusion_err!(
+ "Error in result: expected {} columns but got {} for query {}",
+ expected.len(),
+ actual.len(),
+ query.query
+ ));
+ }
+
+ for (col_idx, expected_val) in
+ expected.iter().enumerate().take(query.column_count)
+ {
+ // The row-width check above guarantees this index exists.
+ let actual_val = &actual[col_idx];
+
+ trace!("actual_val = {actual_val:?}\nexpected_val = {expected_val:?}");
+
+ if (expected_val == "NULL" && actual_val.is_empty())
+ || (expected_val == actual_val)
+ || (expected_val == "(empty)"
+ && (actual_val.is_empty() || actual_val == "NULL"))
+ {
+ continue;
+ }
+
+ return Err(exec_datafusion_err!(
+ "Error in result on row {}, column {} running query \"{}\": expected value \
+ \"{expected_val}\" but got value \"{actual_val}\" in row: {actual:?}",
+ row_idx + 1,
+ col_idx + 1,
+ query.query
+ ));
+ }
+ }
+
+ Ok(())
+ }
+
+ async fn process_file(&mut self, ctx: &SessionContext, path: &Path) -> Result<()> {
+ debug!("Processing file {}", path.display());
+
+ let mut replacement_mapping = self.replacement_mapping.clone();
+ insert_replacement(
+ &mut replacement_mapping,
+ "FILE_PATH",
+ path.to_string_lossy().into_owned(),
+ );
+
+ let mut reader = BenchmarkFileReader::new(path, replacement_mapping)?;
+ let mut line = String::with_capacity(1024);
+ let mut reader_result = reader.read_line(&mut line);
+
+ while let Some(result) = reader_result {
+ match result {
+ Ok(_) => {
+ if !is_blank_or_comment_line(&line) {
+ // boxing required because of recursion
+ Box::pin(self.process_line(ctx, &mut reader, &mut line)).await?;
+ }
+ }
+ Err(e) => return Err(e),
+ }
+
+ // Clear the line buffer for the next iteration.
+ line.clear();
+ reader_result = reader.read_line(&mut line);
+ }
+
+ Ok(())
+ }
+
+ async fn process_line(
+ &mut self,
+ ctx: &SessionContext,
+ reader: &mut BenchmarkFileReader,
+ line: &mut String,
+ ) -> Result<()> {
+ // Split the line into directive and arguments.
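+ // e.g. `run q1.sql` yields the directive `run` plus a file argument,
+ // while a bare `run` is followed by inline SQL on subsequent lines.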
+ let cloned_line = line.trim_start().to_string();
+ let splits: Vec<&str> = cloned_line.split_whitespace().collect();
+
+ BenchmarkDirective::select(reader, splits[0])?
+ .process(ctx, self, reader, line, &splits)
+ .await
+ }
+
+ fn process_query(&mut self, splits: &[&str], mut query: String) -> Result<()> {
+ debug!("Processing query {query}");
+
+ // Trim and validate.
+ query = query.trim().to_string();
+ if query.is_empty() {
+ return Ok(());
+ }
+
+ // remove comments
+ query = query
+ .lines()
+ .filter(|line| !is_comment_line(line))
+ .collect::<Vec<_>>()
+ .join("\n");
+
+ if query.trim().is_empty() {
+ return Ok(());
+ }
+
+ query = process_replacements(&query, self.replacement_mapping())?;
+
+ let directive = QueryDirective::parse(splits[0]).ok_or_else(|| {
+ exec_datafusion_err!("Invalid query directive: {}", splits[0])
+ })?;
+
+ self.queries.entry(directive).or_default().push(query);
+
+ Ok(())
+ }
+
+ fn validate_expected_plan(&self, physical_plan: &impl Debug) -> Result<()> {
+ if self.expect.is_empty() {
+ return Ok(());
+ }
+
+ let plan_string = format!("{physical_plan:#?}");
+
+ for exp_str in &self.expect {
+ if !plan_string.contains(exp_str) {
+ return Err(exec_datafusion_err!(
+ "The query physical plan does not contain the expected string '{exp_str}'. Physical plan: {plan_string}"
+ ));
+ }
+ }
+
+ Ok(())
+ }
+
+ async fn execute_sql_without_result_buffering(
+ &self,
+ sql: &str,
+ ctx: &SessionContext,
+ ) -> Result<usize> {
+ let mut row_count = 0;
+
+ let df = ctx.sql(sql).await?;
+ let physical_plan = df.create_physical_plan().await?;
+
+ self.validate_expected_plan(&physical_plan)?;
+ let mut stream = execute_stream(physical_plan, ctx.task_ctx())?;
+
+ while let Some(batch) = stream.next().await {
+ row_count += batch?.num_rows();
+
+ // Evaluate the result and do nothing, the result will be dropped
+ // to reduce memory pressure
+ }
+
+ Ok(row_count)
+ }
+
+ pub fn name(&self) -> &str {
+ &self.name
+ }
+
+ pub fn group(&self) -> &str {
+ &self.group
+ }
+
+ pub fn subgroup(&self) -> &str {
+ &self.subgroup
+ }
+
+ pub fn benchmark_path(&self) -> &Path {
+ &self.benchmark_path
+ }
+
+ pub fn replacement_mapping(&self) -> &HashMap<String, String> {
+ &self.replacement_mapping
+ }
+
+ pub fn queries(&self) -> &HashMap<QueryDirective, Vec<String>> {
+ &self.queries
+ }
+
+ pub fn result_queries(&self) -> &[BenchmarkQuery] {
+ &self.result_queries
+ }
+
+ pub fn assert_queries(&self) -> &[BenchmarkQuery] {
+ &self.assert_queries
+ }
+
+ pub fn is_loaded(&self) -> bool {
+ self.is_loaded
+ }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum QueryDirective {
+ Load,
+ Run,
+ Init,
+ Cleanup,
+}
+
+impl QueryDirective {
+ fn parse(value: &str) -> Option<Self> {
+ if value.eq_ignore_ascii_case("load") {
+ Some(Self::Load)
+ } else if value.eq_ignore_ascii_case("init") {
+ Some(Self::Init)
+ } else if value.eq_ignore_ascii_case("run") {
+ Some(Self::Run)
+ } else if value.eq_ignore_ascii_case("cleanup") {
+ Some(Self::Cleanup)
+ } else {
+ None
+ }
+ }
+
+ fn as_str(self) -> &'static str {
+ match self {
+ Self::Load => "load",
+ Self::Run => "run",
+ Self::Init => "init",
+ Self::Cleanup => "cleanup",
+ }
+ }
+}
+
+enum BenchmarkDirective {
+ Load,
+ Run,
+ Init,
+ Cleanup,
+ Name,
+ Group,
+ Subgroup,
+ Expect,
+ Assert,
+ ResultQuery,
+ Results,
+ Template,
+ Include,
+ Echo,
+}
+
+impl BenchmarkDirective {
+ fn select(
+ reader: &BenchmarkFileReader,
+ directive: &str,
+ ) -> Result<Self> {
+ if directive.eq_ignore_ascii_case("load") {
+ Ok(BenchmarkDirective::Load)
+ } else if directive.eq_ignore_ascii_case("run")
{ + Ok(BenchmarkDirective::Run) + } else if directive.eq_ignore_ascii_case("init") { + Ok(BenchmarkDirective::Init) + } else if directive.eq_ignore_ascii_case("cleanup") { + Ok(BenchmarkDirective::Cleanup) + } else if directive.eq_ignore_ascii_case("name") { + Ok(BenchmarkDirective::Name) + } else if directive.eq_ignore_ascii_case("group") { + Ok(BenchmarkDirective::Group) + } else if directive.eq_ignore_ascii_case("subgroup") { + Ok(BenchmarkDirective::Subgroup) + } else if directive.eq_ignore_ascii_case("expect_plan") { + Ok(BenchmarkDirective::Expect) + } else if directive.eq_ignore_ascii_case("assert") { + Ok(BenchmarkDirective::Assert) + } else if directive.eq_ignore_ascii_case("result_query") { + Ok(BenchmarkDirective::ResultQuery) + } else if directive.eq_ignore_ascii_case("result") { + Ok(BenchmarkDirective::Results) + } else if directive.eq_ignore_ascii_case("template") { + Ok(BenchmarkDirective::Template) + } else if directive.eq_ignore_ascii_case("include") { + Ok(BenchmarkDirective::Include) + } else if directive.eq_ignore_ascii_case("echo") { + Ok(BenchmarkDirective::Echo) + } else { + Err(exec_datafusion_err!( + "{}", + reader.format_exception(&format!("Unrecognized command: {directive}")) + )) + } + } + + async fn process( + &self, + ctx: &SessionContext, + bench: &mut SqlBenchmark, + reader: &mut BenchmarkFileReader, + line: &mut String, + splits: &[&str], + ) -> Result<()> { + trace!("-- handling {}", splits[0]); + + match self { + BenchmarkDirective::Load + | BenchmarkDirective::Run + | BenchmarkDirective::Init + | BenchmarkDirective::Cleanup => { + Self::process_query_directive(bench, reader, line, splits) + } + BenchmarkDirective::Name => Self::process_metadata_value( + bench, + reader, + line, + "name", + "BENCH_NAME", + "name must be followed by a value", + ), + BenchmarkDirective::Group => Self::process_metadata_value( + bench, + reader, + line, + "group", + "BENCH_GROUP", + "group must be followed by a value", + ), + BenchmarkDirective::Subgroup => Self::process_metadata_value( + bench, + reader, + line, + "subgroup", + "BENCH_SUBGROUP", + "subgroup must be followed by a value", + ), + BenchmarkDirective::Expect => Self::process_expect(bench, reader, splits), + BenchmarkDirective::Assert => { + Self::process_assert(bench, reader, line, splits) + } + BenchmarkDirective::ResultQuery => { + Self::process_result_query(bench, reader, line, splits) + } + BenchmarkDirective::Results => Self::process_results(bench, reader, splits), + BenchmarkDirective::Template => { + Self::process_template(ctx, bench, reader, line, splits).await + } + BenchmarkDirective::Include => { + Self::process_include(ctx, bench, reader, splits).await + } + BenchmarkDirective::Echo => Self::process_echo(bench, reader, splits), + } + } + + fn process_query_directive( + bench: &mut SqlBenchmark, + reader: &mut BenchmarkFileReader, + line: &mut String, + splits: &[&str], + ) -> Result<()> { + let directive = QueryDirective::parse(splits[0]).ok_or_else(|| { + exec_datafusion_err!("Invalid query directive: {}", splits[0]) + })?; + + if directive == QueryDirective::Run && bench.queries.contains_key(&directive) { + return Err(exec_datafusion_err!( + "Multiple calls to run in the same benchmark file" + )); + } + + line.clear(); + + // Read the query body until a blank line or EOF. 
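+ // Comment lines are skipped; every other line is appended verbatim to
+ // `query`, one source line per iteration.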
+ let mut query = String::new(); + let mut reader_result = reader.read_line(line); + + loop { + match reader_result { + Some(Ok(_)) => { + if is_comment_line(line) { + // comment, ignore + } else if is_blank_line(line) { + break; + } else { + query.push_str(line); + query.push('\n'); + } + } + Some(Err(e)) => return Err(e), + None => break, + } + + // Clear the line buffer for the next iteration. + line.clear(); + reader_result = reader.read_line(line); + } + + // Optional file parameter. + if splits.len() > 1 && !splits[1].is_empty() { + if !query.trim().is_empty() { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception(&format!( + "{} directive must use either a query file or inline SQL, not both", + directive.as_str() + )) + )); + } + + debug!("Processing {} file: {}", splits[0], splits[1]); + + let query_file = fs::read_to_string(splits[1]).map_err(|e| { + exec_datafusion_err!("Failed to read query file {}: {e}", splits[1]) + })?; + let query_file = query_file.replace("\r\n", "\n"); + + // some files have multiple queries, split apart + for query in split_query_statements(&query_file) { + bench.process_query(splits, query.to_string())?; + } + } else if directive == QueryDirective::Run { + for query in split_query_statements(&query) { + bench.process_query(splits, query.to_string())?; + } + } else { + bench.process_query(splits, query)?; + } + + Ok(()) + } + + fn process_metadata_value( + bench: &mut SqlBenchmark, + reader: &mut BenchmarkFileReader, + line: &str, + directive: &str, + replacement_key: &str, + message: &str, + ) -> Result<()> { + let value = + directive_value(reader, line.trim_start(), directive, message)?.to_string(); + + match directive { + "name" => bench.name.clone_from(&value), + "group" => bench.group.clone_from(&value), + "subgroup" => bench.subgroup.clone_from(&value), + _ => unreachable!("unsupported metadata directive: {directive}"), + } + + insert_replacement( + &mut bench.replacement_mapping, + replacement_key, + value.clone(), + ); + insert_replacement(&mut reader.replacements, replacement_key, value); + + Ok(()) + } + + fn process_expect( + bench: &mut SqlBenchmark, + reader: &BenchmarkFileReader, + splits: &[&str], + ) -> Result<()> { + trace!("-- handling {}", splits[0]); + + if splits.len() <= 1 || splits[1].is_empty() { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception( + "expect_plan must be followed by a string to search in the physical plan" + ) + )); + } + + bench.expect.push(splits[1..].join(" ").to_string()); + + Ok(()) + } + + fn process_assert( + bench: &mut SqlBenchmark, + reader: &mut BenchmarkFileReader, + line: &mut String, + splits: &[&str], + ) -> Result<()> { + // count the amount of columns based on character count. The actual + // character used is irrelevant. + if splits.len() <= 1 || splits[1].is_empty() { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception( + "assert must be followed by a column count (e.g. assert III)" + ) + )); + } + + line.clear(); + + // read the actual query + let mut found_break = false; + let mut sql = String::new(); + let mut reader_result = reader.read_line(line); + + loop { + match reader_result { + Some(Ok(_)) => { + if line.trim() == "----" { + found_break = true; + break; + } + sql.push('\n'); + sql.push_str(line); + } + Some(Err(e)) => return Err(e), + None => break, + } + + // Clear the line buffer for the next iteration. 
+ line.clear(); + reader_result = reader.read_line(line); + } + + if !found_break { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception( + "assert must be followed by a query and a result (separated by ----)" + ) + )); + } + + bench + .assert_queries + .push(read_query_from_reader(reader, &sql, splits[1])?); + + Ok(()) + } + + fn process_results( + bench: &mut SqlBenchmark, + reader: &BenchmarkFileReader, + splits: &[&str], + ) -> Result<()> { + if splits.len() <= 1 || splits[1].is_empty() { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception( + "result must be followed by a path to a result file" + ) + )); + } + + if !bench.result_queries.is_empty() { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception("multiple results found") + )); + } + + let path = process_replacements(splits[1], &bench.replacement_mapping)?; + + bench.result_queries.push(BenchmarkQuery { + path: Some(path), + query: String::new(), + column_count: 0, + expected_result: vec![], + }); + + Ok(()) + } + + fn process_result_query( + bench: &mut SqlBenchmark, + reader: &mut BenchmarkFileReader, + line: &mut String, + splits: &[&str], + ) -> Result<()> { + if splits.len() <= 1 || splits[1].is_empty() { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception( + "result_query must be followed by a column count (e.g. result_query III)" + ) + )); + } + + line.clear(); + + let mut sql = String::new(); + let mut found_break = false; + let mut reader_result = reader.read_line(line); + + loop { + match reader_result { + Some(Ok(_)) => { + if line.trim() == "----" { + found_break = true; + break; + } + sql.push_str(line); + sql.push('\n'); + } + Some(Err(e)) => return Err(e), + None => break, + } + + // Clear the line buffer for the next iteration. + line.clear(); + reader_result = reader.read_line(line); + } + + if !found_break { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception( + "result_query must be followed by a query and a result (separated by ----)" + ) + )); + } + + let result_check = read_query_from_reader(reader, &sql, splits[1])?; + + if !bench.result_queries.is_empty() { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception("multiple results found") + )); + } + bench.result_queries.push(result_check); + + Ok(()) + } + + async fn process_template( + ctx: &SessionContext, + bench: &mut SqlBenchmark, + reader: &mut BenchmarkFileReader, + line: &mut String, + splits: &[&str], + ) -> Result<()> { + if splits.len() != 2 || splits[1].is_empty() { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception("template requires a single template path") + )); + } + + // template: update the path to read + bench.benchmark_path = PathBuf::from(splits[1]); + + line.clear(); + + // now read parameters + let mut reader_result = reader.read_line(line); + + loop { + match reader_result { + Some(Ok(_)) => { + if is_comment_line(line) { + // Clear the line buffer for the next iteration. + line.clear(); + reader_result = reader.read_line(line); + continue; + } + if is_blank_line(line) { + break; + } + + let Some((key, value)) = line.trim_start().split_once('=') else { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception( + "Expected a template parameter in the form of X=Y" + ) + )); + }; + insert_replacement( + &mut bench.replacement_mapping, + key.trim(), + value.trim().to_string(), + ); + } + Some(Err(e)) => return Err(e), + None => break, + } + + // Clear the line buffer for the next iteration. 
+ line.clear();
+ reader_result = reader.read_line(line);
+ }
+
+ // restart the load from the template file
+ Box::pin(bench.process_file(ctx, Path::new(splits[1]))).await
+ }
+
+ async fn process_include(
+ ctx: &SessionContext,
+ bench: &mut SqlBenchmark,
+ reader: &BenchmarkFileReader,
+ splits: &[&str],
+ ) -> Result<()> {
+ if splits.len() != 2 || splits[1].is_empty() {
+ return Err(exec_datafusion_err!(
+ "{}",
+ reader.format_exception("include requires a single argument")
+ ));
+ }
+
+ Box::pin(bench.process_file(ctx, Path::new(splits[1]))).await
+ }
+
+ fn process_echo(
+ bench: &mut SqlBenchmark,
+ reader: &BenchmarkFileReader,
+ splits: &[&str],
+ ) -> Result<()> {
+ if splits.len() < 2 {
+ return Err(exec_datafusion_err!(
+ "{}",
+ reader.format_exception("Echo requires an argument")
+ ));
+ }
+
+ bench.echo.push(splits[1..].join(" "));
+
+ Ok(())
+ }
+}
+
+struct BenchmarkFileReader {
+ path: PathBuf,
+ reader: BufReader<File>,
+ line_nr: usize,
+ replacements: HashMap<String, String>,
+}
+
+impl BenchmarkFileReader {
+ fn new<P: Into<PathBuf>>(
+ path: P,
+ replacements: HashMap<String, String>,
+ ) -> Result<Self> {
+ let path = path.into();
+ let file = OpenOptions::new().read(true).open(&path)?;
+
+ Ok(Self {
+ path,
+ reader: BufReader::new(file),
+ line_nr: 0,
+ replacements,
+ })
+ }
+
+ /// Read the next line, applying replacements and removing line terminators.
+ fn read_line(&mut self, line: &mut String) -> Option<Result<()>> {
+ match self.reader.read_line(line) {
+ Ok(0) => None,
+ Ok(_) => {
+ self.line_nr += 1;
+
+ // Trim newline and carriage return without changing other content.
+ let trimmed_len = line.trim_end_matches(['\n', '\r']).len();
+ line.truncate(trimmed_len);
+
+ match process_replacements(line, &self.replacements) {
+ Ok(l) => {
+ *line = l;
+ Some(Ok(()))
+ }
+ Err(error) => Some(Err(error)),
+ }
+ }
+ Err(e) => Some(Err(e.into())),
+ }
+ }
+
+ fn format_exception(&self, msg: &str) -> String {
+ format!("{}:{} - {}", self.path.display(), self.line_nr, msg)
+ }
+}
+
+#[derive(Debug, Clone)]
+pub struct BenchmarkQuery {
+ path: Option<String>,
+ query: String,
+ column_count: usize,
+ expected_result: Vec<Vec<String>>,
+}
+
+// ---- utility functions below
+
+fn directive_value<'a>(
+ reader: &BenchmarkFileReader,
+ line: &'a str,
+ directive: &str,
+ message: &str,
+) -> Result<&'a str> {
+ let value = line
+ .get(..directive.len())
+ .filter(|prefix| prefix.eq_ignore_ascii_case(directive))
+ .and_then(|_| line.get(directive.len()..))
+ .map(str::trim)
+ .filter(|s| !s.is_empty())
+ .ok_or_else(|| exec_datafusion_err!("{}", reader.format_exception(message)))?;
+
+ Ok(value)
+}
+
+fn parse_group_from_path(path: &Path, benchmark_directory: &Path) -> String {
+ let mut group_name = String::new();
+ let mut parent = path.parent();
+
+ while let Some(p) = parent {
+ if path_ends_with_ignore_ascii_case(p, benchmark_directory) {
+ break;
+ }
+
+ if let Some(dir_name) = p.file_name() {
+ group_name = dir_name.to_string_lossy().into_owned();
+ }
+
+ parent = p.parent();
+ }
+
+ if group_name.is_empty() {
+ warn!("Unable to find group name in path: {}", path.display());
+ }
+
+ group_name
+}
+
+fn path_ends_with_ignore_ascii_case(path: &Path, suffix: &Path) -> bool {
+ let mut path_components = path.components().rev();
+
+ for suffix_component in suffix.components().rev() {
+ let Some(path_component) = path_components.next() else {
+ return false;
+ };
+
+ if !path_component
+ .as_os_str()
+ .to_string_lossy()
+ .eq_ignore_ascii_case(&suffix_component.as_os_str().to_string_lossy())
+ {
+ return false;
+ }
+ }
+
+ true
+}
+
+fn starts_with_ignore_ascii_case(input: &str, prefix: &str) -> bool {
+ input
+ .get(..prefix.len())
+ .is_some_and(|value| value.eq_ignore_ascii_case(prefix))
+}
+
+fn split_query_statements(sql: &str) -> impl Iterator<Item = &str> {
+ sql.split("\n\n")
+ .flat_map(|query| {
+ query
+ .split_inclusive(";\n")
+ .map(|part| part.trim_end_matches('\n'))
+ })
+ .filter(|query| !query.trim().is_empty())
+}
+
+fn is_blank_line(line: &str) -> bool {
+ line.trim().is_empty()
+}
+
+fn is_comment_line(line: &str) -> bool {
+ let line = line.trim_start();
+ line.starts_with('#') || line.starts_with("--")
+}
+
+fn is_blank_or_comment_line(line: &str) -> bool {
+ is_blank_line(line) || is_comment_line(line)
+}
+
+fn insert_replacement(
+ replacement_map: &mut HashMap<String, String>,
+ key: &str,
+ value: String,
+) {
+ replacement_map.insert(key.to_lowercase(), value);
+}
+
+fn replace_all(
+ re: &Regex,
+ haystack: &str,
+ replacement: impl Fn(&regex::Captures) -> Result<String>,
+) -> Result<String> {
+ let mut new = String::with_capacity(haystack.len());
+ let mut last_match = 0;
+
+ for caps in re.captures_iter(haystack) {
+ let m = caps.get(0).unwrap();
+
+ new.push_str(&haystack[last_match..m.start()]);
+ new.push_str(&replacement(&caps)?);
+
+ last_match = m.end();
+ }
+
+ new.push_str(&haystack[last_match..]);
+
+ Ok(new)
+}
+
+static TRUE_FALSE_REPLACEMENT_RE: LazyLock<Regex> = LazyLock::new(|| {
+ Regex::new(r"\$\{(\w+)(?::-([^|}]+))?\|([^|]+)\|([^}]+)}")
+ .expect("Regex failed to compile")
+});
+
+static VARIABLE_REPLACEMENT_RE: LazyLock<Regex> = LazyLock::new(|| {
+ Regex::new(r"\$\{(\w+)(?::-([^}]+))?}").expect("Regex failed to compile")
+});
+
+/// Replace all `${KEY}` or `${KEY:-default}` placeholders in a string according to the mapping.
+/// Also handles `${KEY:-default|true value|false value}` syntax.
+fn process_replacements(
+ input: &str,
+ replacement_map: &HashMap<String, String>,
+) -> Result<String> {
+ process_replacements_with_env(input, replacement_map, |key| std::env::var(key).ok())
+}
+
+fn process_replacements_with_env(
+ input: &str,
+ replacement_map: &HashMap<String, String>,
+ get_env: impl Fn(&str) -> Option<String>,
+) -> Result<String> {
+ debug!("processing replacements for line '{input}'");
+
+ // handle ${VAR:-default|true value|false value} syntax
+ let replacement = |caps: &regex::Captures| -> Result<String> {
+ let key = &caps[1];
+ let default = caps.get(2).map(|m| m.as_str().to_string());
+ let true_val = &caps[3];
+ let false_val = &caps[4];
+
+ let value = lookup_replacement_value(key, replacement_map, &get_env).or(default);
+
+ match value {
+ Some(v) if v.eq_ignore_ascii_case("true") => Ok(true_val.to_string()),
+ Some(_) => Ok(false_val.to_string()),
+ None => Err(exec_datafusion_err!("Missing value for key '{key}'")),
+ }
+ };
+ let input = replace_all(&TRUE_FALSE_REPLACEMENT_RE, input, replacement)?;
+
+ // handle ${KEY} and ${KEY:-default}
+ let replacement = |caps: &regex::Captures| -> Result<String> {
+ let key = &caps[1];
+ let default = caps.get(2);
+
+ if let Some(v) = lookup_replacement_value(key, replacement_map, &get_env) {
+ return Ok(v.to_string());
+ }
+
+ // use default if it was set
+ if let Some(def) = default {
+ Ok(def.as_str().to_string())
+ } else {
+ Err(exec_datafusion_err!("Missing value for key '{key}'"))
+ }
+ };
+
+ replace_all(&VARIABLE_REPLACEMENT_RE, &input, replacement)
+}
+
+fn lookup_replacement_value(
+ key: &str,
+ replacement_map: &HashMap<String, String>,
+ get_env: &impl Fn(&str) -> Option<String>,
+) -> Option<String> {
+ if let Some(v) = replacement_map.get(&key.to_lowercase()) {
+ return Some(v.to_string());
+ }
+
+ // look in env variables
+ get_env(&key.to_uppercase())
+}
+
+fn read_query_from_reader(
+ reader: &mut BenchmarkFileReader,
+ sql: &str,
+ header: &str,
+) -> Result<BenchmarkQuery> {
+ let column_count = header.len();
+ let mut expected_result = vec![];
+ let mut line = String::new();
+ let mut reader_result = reader.read_line(&mut line);
+
+ loop {
+ match reader_result {
+ Some(Ok(_)) => {
+ if is_comment_line(&line) {
+ // comment, ignore
+ } else if is_blank_line(&line) {
+ break;
+ } else {
+ let result_splits: Vec<&str> = line.split(['\t', '|']).collect();
+
+ if result_splits.len() != column_count {
+ return Err(exec_datafusion_err!(
+ "{} {line}",
+ reader.format_exception(&format!(
+ "expected {} values but got {}",
+ column_count,
+ result_splits.len(),
+ ))
+ ));
+ }
+
+ expected_result
+ .push(result_splits.into_iter().map(|s| s.to_string()).collect());
+ }
+ }
+ Some(Err(e)) => return Err(e),
+ None => break,
+ }
+
+ // Clear the line buffer for the next iteration.
+ line.clear();
+ reader_result = reader.read_line(&mut line);
+ }
+
+ Ok(BenchmarkQuery {
+ path: None,
+ query: sql.to_string(),
+ column_count,
+ expected_result,
+ })
+}
+
+async fn read_query_from_file(
+ ctx: &SessionContext,
+ path: impl AsRef<Path>,
+ replacement_mapping: &HashMap<String, String>,
+) -> Result<BenchmarkQuery> {
+ // Process replacements in file path
+ let path = path.as_ref().to_string_lossy();
+ let path = process_replacements(&path, replacement_mapping)?;
+ let df: DataFrame = ctx
+ .read_csv(
+ path.clone(),
+ CsvReadOptions::new()
+ .has_header(true)
+ .delimiter(b'|')
+ .null_regex(Some("NULL".to_string()))
+ // we only want string values, we do not want to infer the schema
+ .schema_infer_max_records(0),
+ )
+ .await?;
+
+ // Get schema to determine column count
+ let schema = df.schema();
+ let column_count = schema.fields().len();
+
+ if column_count == 0 {
+ return Err(exec_datafusion_err!(
+ "Result file {path} did not contain any columns"
+ ));
+ }
+
+ // Execute and collect results
+ let batches = df.collect().await?;
+ // Convert record batches to string vectors
+ let expected_result = format_record_batches(&batches)?;
+
+ Ok(BenchmarkQuery {
+ path: Some(path),
+ query: String::new(),
+ column_count,
+ expected_result,
+ })
+}
+
+fn format_record_batches(
+ batches: &[RecordBatch],
+) -> Result<Vec<Vec<String>>, DataFusionError> {
+ let mut expected_result = vec![];
+ let arrow_format_options = FormatOptions::default()
+ .with_null("NULL")
+ .with_display_error(true);
+
+ for batch in batches {
+ let schema = batch.schema_ref();
+
+ let formatters = batch
+ .columns()
+ .iter()
+ .zip(schema.fields().iter())
+ .map(|(c, field)| make_array_formatter(c, &arrow_format_options, Some(field)))
+ .collect::<Result<Vec<_>, ArrowError>>()?;
+
+ for row in 0..batch.num_rows() {
+ let mut cells = vec![];
+ for formatter in &formatters {
+ cells.push(formatter.value(row).to_string());
+ }
+ expected_result.push(cells);
+ }
+ }
+
+ Ok(expected_result)
+}
+
+fn make_array_formatter<'a>(
+ array: &'a dyn Array,
+ options: &FormatOptions<'a>,
+ field: Option<&'a Field>,
+) -> Result<ArrayFormatter<'a>, ArrowError> {
+ match options.formatter_factory() {
+ None => ArrayFormatter::try_new(array, options),
+ Some(formatters) => formatters
+ .create_array_formatter(array, options, field)
+ .transpose()
+ .unwrap_or_else(|| ArrayFormatter::try_new(array, options)),
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use datafusion::prelude::SessionContext;
+ use std::fs;
+ use std::path::{Path, PathBuf};
+ use tempfile::{TempDir, tempdir};
+
+ fn write_test_file(temp_dir: &TempDir, name: &str, contents: &str) -> PathBuf {
+ let path = temp_dir.path().join(name);
+ fs::write(&path, contents).expect("failed to write benchmark test file");
+ path
+ }
+
+ async fn parse_benchmark_file(path: &Path) -> Result<SqlBenchmark> {
+ let ctx = SessionContext::new();
+ let path_string = path.to_string_lossy().into_owned();
+ SqlBenchmark::new(&ctx, &path_string, "/tmp").await
+ }
+
+ async fn parse_benchmark(contents: &str) -> Result<SqlBenchmark> {
+ let temp_dir = tempdir().expect("failed to create benchmark test directory");
+ let path = write_test_file(&temp_dir, "parser.benchmark", contents);
+
+ parse_benchmark_file(&path).await
+ }
+
+ async fn assert_parse_error(contents: &str, expected_message: &str) {
+ let error = parse_benchmark(contents)
+ .await
+ .expect_err("benchmark parsing should fail");
+
+ let message = error.to_string();
+ assert!(
+ message.contains(expected_message),
+ "expected error containing {expected_message:?}, got {message:?}"
+ );
+ }
+
+ fn assert_result_error_contains(result: Result<SqlBenchmark>, expected_message: &str) {
+ let error = result.expect_err("operation should fail");
+ let message = error.to_string();
+ assert!(
+ message.contains(expected_message),
+ "expected error containing {expected_message:?}, got {message:?}"
+ );
+ }
+
+ fn formatted_last_results(benchmark: &SqlBenchmark) -> Vec<Vec<String>> {
+ format_record_batches(
+ benchmark
+ .last_results
+ .as_ref()
+ .expect("last results should be set"),
+ )
+ .expect("results should format")
+ }
+
+ fn read_all_files_in_dir(path: &Path) -> String {
+ let mut entries = fs::read_dir(path)
+ .expect("directory should be readable")
+ .filter_map(Result::ok)
+ .map(|entry| entry.path())
+ .filter(|path| path.is_file())
+ .collect::<Vec<_>>();
+ entries.sort();
+
+ let mut contents = String::new();
+ for path in entries {
+ contents
+ .push_str(&fs::read_to_string(path).expect("file should be readable"));
+ }
+ contents
+ }
+
+ fn replacement_map(entries: &[(&str, &str)]) -> HashMap<String, String> {
+ let mut replacements = HashMap::new();
+ for (key, value) in entries {
+ insert_replacement(&mut replacements, key, value.to_string());
+ }
+ replacements
+ }
+
+ fn env_map(entries: &[(&str, &str)]) -> HashMap<String, String> {
+ entries
+ .iter()
+ .map(|(key, value)| (key.to_string(), value.to_string()))
+ .collect()
+ }
+
+ // Replacement tests cover benchmark variable expansion syntax.
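+ //
+ // Supported placeholder forms (resolved from the replacement map first,
+ // then from environment variables):
+ // ${KEY} required value
+ // ${KEY:-default} value with a fallback
+ // ${KEY|true val|false val} boolean branch on the value
+ // ${KEY:-default|true val|false val} boolean branch with a fallback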
+ + #[test] + fn process_replacements_replaces_map_values_case_insensitively() { + let replacements = replacement_map(&[ + ("BENCH_NAME", "tpch"), + ("QUERY_NUMBER_PADDED", "01"), + ("format_1", "parquet"), + ]); + + let actual = process_replacements_with_env( + "${bench_name}/q${query_number_padded}.${FORMAT_1}", + &replacements, + |_| None, + ) + .expect("replacement should succeed"); + + assert_eq!(actual, "tpch/q01.parquet"); + } + + #[test] + fn process_replacements_uses_env_when_map_value_is_missing() { + let replacements = HashMap::new(); + let env = env_map(&[("DATA_DIR", "/tmp/data")]); + + let actual = process_replacements_with_env( + "${data_dir}/lineitem.parquet", + &replacements, + |key| env.get(key).cloned(), + ) + .expect("replacement should succeed"); + + assert_eq!(actual, "/tmp/data/lineitem.parquet"); + } + + #[test] + fn process_replacements_prefers_map_over_env() { + let replacements = replacement_map(&[("BENCH_SIZE", "10")]); + let env = env_map(&[("BENCH_SIZE", "100")]); + + let actual = + process_replacements_with_env("sf${BENCH_SIZE}", &replacements, |key| { + env.get(key).cloned() + }) + .expect("replacement should succeed"); + + assert_eq!(actual, "sf10"); + } + + #[test] + fn process_replacements_uses_default_for_missing_variable() { + let replacements = HashMap::new(); + + let actual = process_replacements_with_env( + "load_${BENCH_SUBGROUP:-groupby}_${FILE_TYPE:-csv}.sql", + &replacements, + |_| None, + ) + .expect("replacement should succeed"); + + assert_eq!(actual, "load_groupby_csv.sql"); + } + + #[test] + fn process_replacements_reports_missing_variable_without_default() { + let replacements = HashMap::new(); + + let error = process_replacements_with_env("${MISSING}", &replacements, |_| None) + .expect_err("replacement should fail"); + + assert!( + error + .to_string() + .contains("Missing value for key 'MISSING'"), + "unexpected error: {error}" + ); + } + + #[test] + fn process_replacements_applies_true_false_true_branch() { + let replacements = HashMap::new(); + let env = env_map(&[("USE_PARQUET", "TrUe")]); + + let actual = process_replacements_with_env( + "load_${USE_PARQUET:-false|parquet|csv}.sql", + &replacements, + |key| env.get(key).cloned(), + ) + .expect("replacement should succeed"); + + assert_eq!(actual, "load_parquet.sql"); + } + + #[test] + fn process_replacements_applies_true_false_false_branch() { + let replacements = HashMap::new(); + let env = env_map(&[("USE_PARQUET", "false")]); + + let actual = process_replacements_with_env( + "load_${USE_PARQUET:-true|parquet|csv}.sql", + &replacements, + |key| env.get(key).cloned(), + ) + .expect("replacement should succeed"); + + assert_eq!(actual, "load_csv.sql"); + } + + #[test] + fn process_replacements_uses_map_for_true_false_branch() { + let replacements = replacement_map(&[("USE_PARQUET", "true")]); + + let actual = process_replacements_with_env( + "load_${USE_PARQUET:-false|parquet|csv}.sql", + &replacements, + |_| None, + ) + .expect("replacement should succeed"); + + assert_eq!(actual, "load_parquet.sql"); + } + + #[test] + fn process_replacements_prefers_map_over_env_for_true_false_branch() { + let replacements = replacement_map(&[("USE_PARQUET", "false")]); + let env = env_map(&[("USE_PARQUET", "true")]); + + let actual = process_replacements_with_env( + "load_${USE_PARQUET:-true|parquet|csv}.sql", + &replacements, + |key| env.get(key).cloned(), + ) + .expect("replacement should succeed"); + + assert_eq!(actual, "load_csv.sql"); + } + + #[test] + fn 
process_replacements_uses_true_false_default_for_missing_true_value() { + let replacements = HashMap::new(); + + let actual = process_replacements_with_env( + "load_${USE_PARQUET:-true|parquet|csv}.sql", + &replacements, + |_| None, + ) + .expect("replacement should succeed"); + + assert_eq!(actual, "load_parquet.sql"); + } + + #[test] + fn process_replacements_uses_true_false_default_for_missing_false_value() { + let replacements = HashMap::new(); + + let actual = process_replacements_with_env( + "load_${USE_PARQUET:-false|parquet|csv}.sql", + &replacements, + |_| None, + ) + .expect("replacement should succeed"); + + assert_eq!(actual, "load_csv.sql"); + } + + #[test] + fn process_replacements_reports_missing_true_false_variable_without_default() { + let replacements = HashMap::new(); + + let error = process_replacements_with_env( + "load_${USE_PARQUET|parquet|csv}.sql", + &replacements, + |_| None, + ) + .expect_err("replacement should fail"); + + assert!( + error + .to_string() + .contains("Missing value for key 'USE_PARQUET'"), + "unexpected error: {error}" + ); + } + + #[test] + fn process_replacements_resolves_variables_after_true_false_replacement() { + let replacements = replacement_map(&[("FILE_TYPE", "parquet")]); + let env = env_map(&[("USE_TYPED_PATH", "true")]); + + let actual = process_replacements_with_env( + "${USE_TYPED_PATH:-false|data.${FILE_TYPE}|data.csv}", + &replacements, + |key| env.get(key).cloned(), + ) + .expect("replacement should succeed"); + + assert_eq!(actual, "data.parquet"); + } + + #[test] + fn process_replacements_leaves_unsupported_placeholder_syntax_unchanged() { + let replacements = HashMap::new(); + + let actual = + process_replacements_with_env("${BAD-KEY:-fallback}", &replacements, |_| { + None + }) + .expect("unsupported placeholder should not match replacement regex"); + + assert_eq!(actual, "${BAD-KEY:-fallback}"); + } + + // Parser tests cover benchmark directives and parse-time validation. 
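+ //
+ // Directives exercised below: name, group, subgroup, expect_plan, echo,
+ // load, init, run, cleanup, assert, result_query, result, include, and
+ // template (see `BenchmarkDirective::select`).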
+ + #[tokio::test] + async fn parser_accepts_metadata_expect_echo_and_sql_sections() { + let benchmark = parse_benchmark( + r#" +# top-level comments are ignored +name Parser Success +group Parser Group +subgroup Parser Subgroup +expect_plan ProjectionExec with details +echo hello from parser + +load +-- query comments are ignored +CREATE TABLE t AS VALUES (1); + +init +CREATE VIEW v AS SELECT * FROM t; + +run +SELECT * FROM v; + +cleanup +DROP VIEW v; +"#, + ) + .await + .expect("benchmark should parse"); + + assert_eq!(benchmark.name(), "Parser Success"); + assert_eq!(benchmark.group(), "Parser Group"); + assert_eq!(benchmark.subgroup(), "Parser Subgroup"); + assert_eq!(benchmark.expect, vec!["ProjectionExec with details"]); + assert_eq!(benchmark.echo, vec!["hello from parser"]); + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Load) + .expect("load query"), + &vec!["CREATE TABLE t AS VALUES (1);".to_string()] + ); + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Init) + .expect("init query"), + &vec!["CREATE VIEW v AS SELECT * FROM t;".to_string()] + ); + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Run) + .expect("run query"), + &vec!["SELECT * FROM v;".to_string()] + ); + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Cleanup) + .expect("cleanup query"), + &vec!["DROP VIEW v;".to_string()] + ); + } + + #[tokio::test] + async fn parser_splits_inline_run_block_on_semicolon_newline() { + let benchmark = parse_benchmark( + r#" +run +CREATE TABLE t AS SELECT 1 AS value; +SELECT value + 1 AS value FROM t; +DROP TABLE t; +"#, + ) + .await + .expect("benchmark should parse"); + + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Run) + .expect("run query"), + &vec![ + "CREATE TABLE t AS SELECT 1 AS value;".to_string(), + "SELECT value + 1 AS value FROM t;".to_string(), + "DROP TABLE t;".to_string(), + ] + ); + } + + #[tokio::test] + async fn parser_accepts_assert_with_expected_rows() { + let benchmark = parse_benchmark( + r#" +assert II +select 1, 'one' +---- +1|one +2 two +"#, + ) + .await + .expect("benchmark should parse"); + + let query = benchmark + .assert_queries() + .first() + .expect("assert query should be parsed"); + + assert_eq!(query.column_count, 2); + assert!(query.query.contains("select 1, 'one'")); + assert_eq!( + query.expected_result, + vec![ + vec!["1".to_string(), "one".to_string()], + vec!["2".to_string(), "two".to_string()] + ] + ); + } + + #[tokio::test] + async fn parser_accepts_result_query_with_expected_rows() { + let benchmark = parse_benchmark( + r#" +result_query II +select 1, 'one' +---- +1|one +NULL|(empty) +"#, + ) + .await + .expect("benchmark should parse"); + + let query = benchmark + .result_queries() + .first() + .expect("result query should be parsed"); + + assert_eq!(query.path, None); + assert_eq!(query.column_count, 2); + assert!(query.query.contains("select 1, 'one'")); + assert_eq!( + query.expected_result, + vec![ + vec!["1".to_string(), "one".to_string()], + vec!["NULL".to_string(), "(empty)".to_string()] + ] + ); + } + + #[tokio::test] + async fn parser_records_result_file_without_parsing_contents() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let result_path = + write_test_file(&temp_dir, "result.csv", "col_a|col_b\n1|one\nNULL|two\n"); + let benchmark_path = write_test_file( + &temp_dir, + "result.benchmark", + &format!("result {}\n", result_path.display()), + ); + + let benchmark = parse_benchmark_file(&benchmark_path) + .await + 
.expect("benchmark should parse"); + + let query = benchmark + .result_queries() + .first() + .expect("result file should be parsed"); + + assert_eq!(query.path, Some(result_path.to_string_lossy().into_owned())); + assert_eq!(query.column_count, 0); + assert!(query.expected_result.is_empty()); + } + + #[tokio::test] + async fn parser_accepts_include_file() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let include_path = + write_test_file(&temp_dir, "include.benchmark", "run\nselect 1\n"); + + let benchmark_path = write_test_file( + &temp_dir, + "include_driver.benchmark", + &format!("include {}\n", include_path.display()), + ); + + let result = parse_benchmark_file(&benchmark_path).await; + + let benchmark = result.expect("benchmark should parse"); + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Run) + .expect("run query"), + &vec!["select 1".to_string()] + ); + } + + #[tokio::test] + async fn parser_accepts_template_file_with_parameters() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let template_path = write_test_file( + &temp_dir, + "template_success.benchmark", + "# template comments are ignored\nrun\n-- query comments are ignored\nselect '${TABLE_NAME}', '${BENCHMARK_DIR}'\n", + ); + + let benchmark_path = write_test_file( + &temp_dir, + "template_success_driver.benchmark", + &format!( + "template {}\n# parameter comments are ignored\nTABLE_NAME=orders\n", + template_path.display() + ), + ); + + let result = parse_benchmark_file(&benchmark_path).await; + + let benchmark = result.expect("benchmark should parse"); + assert_eq!(benchmark.benchmark_path(), template_path.as_path()); + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Run) + .expect("run query"), + &vec!["select 'orders', '/tmp'".to_string()] + ); + } + + #[tokio::test] + async fn parser_trims_template_parameter_keys_and_values() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let template_path = write_test_file( + &temp_dir, + "template_trim.benchmark", + "run\nselect '${TABLE_NAME}'\n", + ); + + let benchmark_path = write_test_file( + &temp_dir, + "template_trim_driver.benchmark", + &format!( + "template {}\n TABLE_NAME = orders \n", + template_path.display() + ), + ); + + let benchmark = parse_benchmark_file(&benchmark_path) + .await + .expect("benchmark should parse"); + + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Run) + .expect("run query"), + &vec!["select 'orders'".to_string()] + ); + assert_eq!( + benchmark.replacement_mapping().get("table_name"), + Some(&"orders".to_string()) + ); + } + + #[tokio::test] + async fn parser_preserves_expected_result_cell_whitespace() { + let benchmark = parse_benchmark("assert I\nselect ' x '\n----\n x \n") + .await + .expect("benchmark should parse"); + + let query = benchmark + .assert_queries() + .first() + .expect("assert query should be parsed"); + + assert_eq!(query.expected_result, vec![vec![" x ".to_string()]]); + } + + #[tokio::test] + async fn parser_accepts_indented_comments_and_blank_lines() { + let benchmark = + parse_benchmark(" # comment\n -- comment\n run\n select 1\n \n") + .await + .expect("benchmark should parse"); + + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Run) + .expect("run query"), + &vec!["select 1".to_string()] + ); + } + + #[tokio::test] + async fn parser_accepts_case_insensitive_query_directives() { + let benchmark = parse_benchmark("RUN\nselect 1\n") + .await + .expect("benchmark 
should parse"); + + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Run) + .expect("run query"), + &vec!["select 1".to_string()] + ); + } + + #[tokio::test] + async fn parser_accepts_query_file_and_splits_statements() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let query_path = write_test_file( + &temp_dir, + "queries.sql", + "-- leading comment\nSELECT 1 AS value;\nSELECT 2 AS value;\n\n# another comment\nWITH t AS (SELECT 3 AS value) SELECT * FROM t;\n", + ); + let benchmark_path = write_test_file( + &temp_dir, + "query_file.benchmark", + &format!("run {}\n", query_path.display()), + ); + + let benchmark = parse_benchmark_file(&benchmark_path) + .await + .expect("benchmark should parse"); + + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Run) + .expect("run queries"), + &vec![ + "SELECT 1 AS value;".to_string(), + "SELECT 2 AS value;".to_string(), + "WITH t AS (SELECT 3 AS value) SELECT * FROM t;".to_string(), + ] + ); + } + + #[tokio::test] + async fn parser_accepts_replacements_in_query_file_path() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let query_path = + write_test_file(&temp_dir, "queries.sql", "SELECT 5 AS value;\n"); + let template_path = write_test_file( + &temp_dir, + "query_file_path_template.benchmark", + "run ${QUERY_PATH}\n", + ); + let benchmark_path = write_test_file( + &temp_dir, + "query_file_path_driver.benchmark", + &format!( + "template {}\nQUERY_PATH={}\n", + template_path.display(), + query_path.display() + ), + ); + + let benchmark = parse_benchmark_file(&benchmark_path) + .await + .expect("benchmark should parse"); + + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Run) + .expect("run query"), + &vec!["SELECT 5 AS value;".to_string()] + ); + } + + #[tokio::test] + async fn parser_rejects_inline_sql_when_query_file_is_provided() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let query_path = + write_test_file(&temp_dir, "queries.sql", "SELECT 1 AS value;\n"); + let benchmark_path = write_test_file( + &temp_dir, + "query_file_with_inline_body.benchmark", + &format!("run {}\nSELECT 999 AS value;\n", query_path.display()), + ); + + let result = parse_benchmark_file(&benchmark_path).await; + + assert_result_error_contains( + result, + "run directive must use either a query file or inline SQL, not both", + ); + } + + #[tokio::test] + async fn parser_rejects_inline_sql_when_load_file_is_provided() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let query_path = write_test_file( + &temp_dir, + "load.sql", + "CREATE TABLE t AS SELECT 1 AS value;\n", + ); + let benchmark_path = write_test_file( + &temp_dir, + "load_file_with_inline_body.benchmark", + &format!( + "load {}\nCREATE TABLE u AS SELECT 2 AS value;\n", + query_path.display() + ), + ); + + let result = parse_benchmark_file(&benchmark_path).await; + + assert_result_error_contains( + result, + "load directive must use either a query file or inline SQL, not both", + ); + } + + #[tokio::test] + async fn parser_rejects_inline_sql_when_init_file_is_provided() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let query_path = write_test_file( + &temp_dir, + "init.sql", + "CREATE VIEW v AS SELECT 1 AS value;\n", + ); + let benchmark_path = write_test_file( + &temp_dir, + "init_file_with_inline_body.benchmark", + &format!( + "init {}\nCREATE VIEW w AS SELECT 2 AS value;\n", + 
query_path.display() + ), + ); + + let result = parse_benchmark_file(&benchmark_path).await; + + assert_result_error_contains( + result, + "init directive must use either a query file or inline SQL, not both", + ); + } + + #[tokio::test] + async fn parser_rejects_inline_sql_when_cleanup_file_is_provided() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let query_path = write_test_file(&temp_dir, "cleanup.sql", "DROP TABLE t;\n"); + let benchmark_path = write_test_file( + &temp_dir, + "cleanup_file_with_inline_body.benchmark", + &format!("cleanup {}\nDROP TABLE u;\n", query_path.display()), + ); + + let result = parse_benchmark_file(&benchmark_path).await; + + assert_result_error_contains( + result, + "cleanup directive must use either a query file or inline SQL, not both", + ); + } + + #[tokio::test] + async fn parser_ignores_query_file_with_only_comments_and_blank_lines() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let query_path = write_test_file( + &temp_dir, + "queries.sql", + "# comment\n\n-- another comment\n\n", + ); + let benchmark_path = write_test_file( + &temp_dir, + "empty_query_file.benchmark", + &format!("run {}\n", query_path.display()), + ); + + let benchmark = parse_benchmark_file(&benchmark_path) + .await + .expect("benchmark should parse"); + + assert!(!benchmark.queries().contains_key(&QueryDirective::Run)); + } + + #[tokio::test] + async fn parser_splits_query_file_with_windows_line_endings() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let query_path = write_test_file( + &temp_dir, + "queries.sql", + "SELECT 1 AS value;\r\nSELECT 2 AS value;\r\n", + ); + let benchmark_path = write_test_file( + &temp_dir, + "windows_query_file.benchmark", + &format!("run {}\n", query_path.display()), + ); + + let benchmark = parse_benchmark_file(&benchmark_path) + .await + .expect("benchmark should parse"); + + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Run) + .expect("run queries"), + &vec![ + "SELECT 1 AS value;".to_string(), + "SELECT 2 AS value;".to_string() + ] + ); + } + + #[tokio::test] + async fn parser_rejects_unknown_command() { + assert_parse_error("wat\n", "Unrecognized command: wat").await; + } + + #[tokio::test] + async fn parser_rejects_assert_without_column_count() { + assert_parse_error( + "assert\nselect 1\n----\n1\n", + "assert must be followed by a column count", + ) + .await; + } + + #[tokio::test] + async fn parser_rejects_assert_without_result_separator() { + assert_parse_error( + "assert I\nselect 1\n1\n", + "assert must be followed by a query and a result (separated by ----)", + ) + .await; + } + + #[tokio::test] + async fn parser_rejects_result_query_without_separator() { + assert_parse_error( + "result_query I\nselect 1\n1\n", + "result_query must be followed by a query and a result (separated by ----)", + ) + .await; + } + + #[tokio::test] + async fn parser_rejects_result_query_with_wrong_column_count() { + assert_parse_error( + "result_query II\nselect 1\n----\n1\n", + "expected 2 values but got 1", + ) + .await; + } + + #[tokio::test] + async fn parser_rejects_multiple_result_queries() { + assert_parse_error( + "result_query I\nselect 1\n----\n1\n\nresult_query I\nselect 2\n----\n2\n", + "multiple results found", + ) + .await; + } + + #[tokio::test] + async fn parser_rejects_duplicate_run_directives() { + assert_parse_error("run\nselect 1\n\nrun\nselect 2\n", "Multiple calls to run") + .await; + } + + #[tokio::test] + async 
fn parser_accepts_multiple_load_directives() { + let benchmark = parse_benchmark( + "load\nCREATE TABLE t AS SELECT 1;\n\nload\nCREATE TABLE u AS SELECT 2;\n", + ) + .await + .expect("benchmark should parse"); + + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Load) + .expect("load queries"), + &vec![ + "CREATE TABLE t AS SELECT 1;".to_string(), + "CREATE TABLE u AS SELECT 2;".to_string(), + ] + ); + } + + #[tokio::test] + async fn parser_accepts_multiple_init_directives() { + let benchmark = parse_benchmark( + "init\nCREATE VIEW v AS SELECT 1;\n\ninit\nCREATE VIEW w AS SELECT 2;\n", + ) + .await; + + let benchmark = benchmark.expect("benchmark should parse"); + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Init) + .expect("init queries"), + &vec![ + "CREATE VIEW v AS SELECT 1;".to_string(), + "CREATE VIEW w AS SELECT 2;".to_string(), + ] + ); + } + + #[tokio::test] + async fn parser_accepts_multiple_cleanup_directives() { + let benchmark = + parse_benchmark("cleanup\nDROP TABLE t;\n\ncleanup\nDROP TABLE u;\n") + .await + .expect("benchmark should parse"); + + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Cleanup) + .expect("cleanup queries"), + &vec!["DROP TABLE t;".to_string(), "DROP TABLE u;".to_string(),] + ); + } + + #[tokio::test] + async fn parser_rejects_missing_query_file() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let missing_path = temp_dir.path().join("missing.sql"); + let benchmark_path = write_test_file( + &temp_dir, + "missing_query_file.benchmark", + &format!("run {}\n", missing_path.display()), + ); + + let result = parse_benchmark_file(&benchmark_path).await; + + assert_result_error_contains(result, "Failed to read query file"); + } + + #[tokio::test] + async fn parser_rejects_template_with_invalid_parameter_assignment() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let template_path = + write_test_file(&temp_dir, "template.benchmark", "run\nselect 1\n"); + + let benchmark_path = write_test_file( + &temp_dir, + "template_driver.benchmark", + &format!("template {}\nINVALID\n", template_path.display()), + ); + + let ctx = SessionContext::new(); + let benchmark_path_string = benchmark_path.to_string_lossy().into_owned(); + let result = SqlBenchmark::new(&ctx, &benchmark_path_string, "/tmp").await; + + let error = result.expect_err("benchmark parsing should fail"); + let message = error.to_string(); + assert!( + message.contains("Expected a template parameter in the form of X=Y"), + "expected template parameter error, got {message:?}" + ); + } + + #[tokio::test] + async fn parser_rejects_metadata_and_result_directives_without_values() { + assert_parse_error("name\n", "name must be followed by a value").await; + assert_parse_error("group\n", "group must be followed by a value").await; + assert_parse_error("subgroup\n", "subgroup must be followed by a value").await; + assert_parse_error( + "expect_plan\n", + "expect_plan must be followed by a string to search in the physical plan", + ) + .await; + assert_parse_error("echo\n", "Echo requires an argument").await; + assert_parse_error( + "result\n", + "result must be followed by a path to a result file", + ) + .await; + assert_parse_error("include\n", "include requires a single argument").await; + assert_parse_error("template\n", "template requires a single template path") + .await; + } + + #[tokio::test] + async fn parser_rejects_include_and_template_with_too_many_arguments() { + assert_parse_error("include 
a b\n", "include requires a single argument").await; + assert_parse_error("template a b\n", "template requires a single template path") + .await; + } + + #[tokio::test] + async fn parser_rejects_missing_include_file() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let missing_path = temp_dir.path().join("missing_include.benchmark"); + let benchmark_path = write_test_file( + &temp_dir, + "missing_include_driver.benchmark", + &format!("include {}\n", missing_path.display()), + ); + + let result = parse_benchmark_file(&benchmark_path).await; + + assert_result_error_contains(result, "No such file"); + } + + #[tokio::test] + async fn parser_rejects_missing_template_file() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let missing_path = temp_dir.path().join("missing_template.benchmark"); + let benchmark_path = write_test_file( + &temp_dir, + "missing_template_driver.benchmark", + &format!("template {}\n", missing_path.display()), + ); + + let result = parse_benchmark_file(&benchmark_path).await; + + assert_result_error_contains(result, "No such file"); + } + + #[tokio::test] + async fn parser_uses_metadata_values_as_replacements() { + let benchmark = parse_benchmark( + r#" +name Q01 +group tpch +subgroup sf1 + +run +SELECT '${BENCH_NAME}', '${BENCH_GROUP}', '${BENCH_SUBGROUP}' +"#, + ) + .await + .expect("benchmark should parse"); + + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Run) + .expect("run query"), + &vec!["SELECT 'Q01', 'tpch', 'sf1'".to_string()] + ); + } + + #[tokio::test] + async fn parser_accepts_replacement_in_result_file_path() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let result_path = write_test_file(&temp_dir, "result.csv", "value\n1\n"); + let template_path = write_test_file( + &temp_dir, + "result_path_template.benchmark", + "result ${RESULT_PATH}\n", + ); + let benchmark_path = write_test_file( + &temp_dir, + "result_path_driver.benchmark", + &format!( + "template {}\nRESULT_PATH={}\n", + template_path.display(), + result_path.display() + ), + ); + + let benchmark = parse_benchmark_file(&benchmark_path) + .await + .expect("benchmark should parse"); + + let query = benchmark + .result_queries() + .first() + .expect("result query should be parsed"); + assert_eq!(query.path, Some(result_path.to_string_lossy().into_owned())); + assert_eq!(query.column_count, 0); + assert!(query.expected_result.is_empty()); + } + + #[tokio::test] + async fn parser_rejects_missing_replacement_in_result_file_path() { + assert_parse_error("result ${MISSING_RESULT_PATH}\n", "Missing value for key") + .await; + } + + #[tokio::test] + async fn parser_accepts_missing_result_file() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let missing_path = temp_dir.path().join("missing_result.csv"); + let benchmark_path = write_test_file( + &temp_dir, + "missing_result_file.benchmark", + &format!("result {}\n", missing_path.display()), + ); + + let benchmark = parse_benchmark_file(&benchmark_path) + .await + .expect("benchmark should parse"); + + let query = benchmark + .result_queries() + .first() + .expect("result file should be parsed"); + assert_eq!( + query.path, + Some(missing_path.to_string_lossy().into_owned()) + ); + assert!(query.expected_result.is_empty()); + } + + #[tokio::test] + async fn parser_accepts_malformed_result_file() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let result_path = 
temp_dir.path().join("malformed_result.csv"); + fs::write(&result_path, [0xff]).expect("failed to write malformed result file"); + let benchmark_path = write_test_file( + &temp_dir, + "malformed_result_file.benchmark", + &format!("result {}\n", result_path.display()), + ); + + let benchmark = parse_benchmark_file(&benchmark_path) + .await + .expect("benchmark should parse"); + + let query = benchmark + .result_queries() + .first() + .expect("result file should be parsed"); + assert_eq!(query.path, Some(result_path.to_string_lossy().into_owned())); + assert!(query.expected_result.is_empty()); + } + + // Lifecycle tests cover initialization, assertions, and cleanup execution. + + #[tokio::test] + async fn initialize_executes_load_before_init_and_is_idempotent() { + let mut benchmark = parse_benchmark( + r#" +load +CREATE TABLE t AS SELECT 1 AS value; + +load +CREATE TABLE u AS SELECT value + 1 AS value FROM t; + +init +CREATE TABLE v AS SELECT value + 1 AS value FROM u; + +init +CREATE TABLE initialized AS SELECT value + 1 AS value FROM v; + +run +SELECT value FROM initialized; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark + .initialize(&ctx) + .await + .expect("initialize should succeed"); + benchmark + .initialize(&ctx) + .await + .expect("second initialize should be a no-op"); + + assert!(benchmark.is_loaded()); + + let rows = ctx + .sql("SELECT value FROM initialized") + .await + .expect("query should plan") + .collect() + .await + .expect("query should run"); + + assert_eq!(format_record_batches(&rows).unwrap(), vec![vec!["4"]]); + } + + #[tokio::test] + async fn initialize_rejects_benchmark_without_run_query() { + let mut benchmark = parse_benchmark( + r#" +load +CREATE TABLE t AS SELECT 1 AS value; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.initialize(&ctx).await, + "Invalid benchmark file: no \"run\" query specified", + ); + } + + #[tokio::test] + async fn initialize_propagates_load_query_failures() { + let mut benchmark = parse_benchmark( + r#" +load +CREATE TABLE t AS SELECT * FROM missing_load_table; + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.initialize(&ctx).await, + "missing_load_table", + ); + } + + #[tokio::test] + async fn initialize_propagates_init_query_failures() { + let mut benchmark = parse_benchmark( + r#" +init +CREATE TABLE t AS SELECT * FROM missing_init_table; + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.initialize(&ctx).await, + "missing_init_table", + ); + } + + #[tokio::test] + async fn cleanup_executes_cleanup_queries() { + let mut benchmark = parse_benchmark( + r#" +run +SELECT 1; + +cleanup +CREATE TABLE cleanup_marker_a AS SELECT 7 AS value; + +cleanup +CREATE TABLE cleanup_marker_b AS SELECT value + 1 AS value FROM cleanup_marker_a; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.cleanup(&ctx).await.expect("cleanup should run"); + + let rows = ctx + .sql("SELECT value FROM cleanup_marker_b") + .await + .expect("query should plan") + .collect() + .await + .expect("query should run"); + assert_eq!(format_record_batches(&rows).unwrap(), vec![vec!["8"]]); + } + + #[tokio::test] + async fn cleanup_propagates_query_failures() { + let mut 
benchmark = parse_benchmark( + r#" +run +SELECT 1; + +cleanup +SELECT * FROM missing_cleanup_table; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.cleanup(&ctx).await, + "missing_cleanup_table", + ); + } + + #[tokio::test] + async fn assert_executes_assert_queries_successfully() { + let mut benchmark = parse_benchmark( + r#" +assert I +SELECT 1 AS value +---- +1 + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.assert(&ctx).await.expect("assert should pass"); + } + + #[tokio::test] + async fn assert_accepts_null_expected_for_empty_actual() { + let mut benchmark = parse_benchmark( + r#" +assert I +SELECT '' AS value +---- +NULL + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.assert(&ctx).await.expect("assert should pass"); + } + + #[tokio::test] + async fn assert_accepts_empty_marker_for_empty_actual() { + let mut benchmark = parse_benchmark( + r#" +assert I +SELECT '' AS value +---- +(empty) + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.assert(&ctx).await.expect("assert should pass"); + } + + #[tokio::test] + async fn assert_accepts_empty_marker_for_null_actual() { + let mut benchmark = parse_benchmark( + r#" +assert I +SELECT CAST(NULL AS VARCHAR) AS value +---- +(empty) + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.assert(&ctx).await.expect("assert should pass"); + } + + #[tokio::test] + async fn assert_succeeds_with_zero_actual_and_expected_rows() { + let mut benchmark = parse_benchmark( + r#" +assert I +SELECT 1 AS value WHERE false +---- + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.assert(&ctx).await.expect("assert should pass"); + } + + #[tokio::test] + async fn assert_propagates_query_failures() { + let mut benchmark = parse_benchmark( + r#" +assert I +SELECT * FROM missing_assert_table +---- +1 + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.assert(&ctx).await, + "missing_assert_table", + ); + } + + #[tokio::test] + async fn assert_reports_row_count_mismatch() { + let mut benchmark = parse_benchmark( + r#" +assert I +SELECT 1 AS value +---- +1 +2 + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.assert(&ctx).await, + "expected 2 rows but got 1", + ); + } + + #[tokio::test] + async fn assert_reports_column_count_mismatch() { + let mut benchmark = parse_benchmark( + r#" +assert I +SELECT 1 AS a, 2 AS b +---- +1 + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.assert(&ctx).await, + "expected 1 columns but got 2", + ); + } + + #[tokio::test] + async fn assert_reports_value_mismatch() { + let mut benchmark = parse_benchmark( + r#" +assert I +SELECT 1 AS value +---- +2 + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.assert(&ctx).await, + "expected value \"2\" but got value \"1\"", + ); + } + + // Run tests cover result 
buffering and physical-plan expectations. + + #[tokio::test] + async fn run_saves_uppercase_select_results() { + let mut benchmark = parse_benchmark("run\nSELECT 1 AS value\n") + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.run(&ctx, true).await.expect("run should succeed"); + + assert_eq!(formatted_last_results(&benchmark), vec![vec!["1"]]); + } + + #[tokio::test] + async fn run_saves_with_query_results() { + let mut benchmark = + parse_benchmark("run\nWITH t AS (SELECT 3 AS value) SELECT value FROM t\n") + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.run(&ctx, true).await.expect("run should succeed"); + + assert_eq!(formatted_last_results(&benchmark), vec![vec!["3"]]); + } + + #[tokio::test] + async fn run_only_keeps_last_select_or_with_result() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let query_path = write_test_file( + &temp_dir, + "queries.sql", + "SELECT 1 AS value;\nSELECT 2 AS value;\nWITH t AS (SELECT 3 AS value) SELECT value FROM t;\n", + ); + let benchmark_path = write_test_file( + &temp_dir, + "run_file.benchmark", + &format!("run {}\n", query_path.display()), + ); + let mut benchmark = parse_benchmark_file(&benchmark_path) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.run(&ctx, true).await.expect("run should succeed"); + + assert_eq!(formatted_last_results(&benchmark), vec![vec!["3"]]); + } + + #[tokio::test] + async fn run_inline_multi_statement_only_keeps_last_select_or_with_result() { + let mut benchmark = parse_benchmark( + "run\nCREATE TABLE t AS SELECT 1 AS value;\nSELECT 2 AS value;\nWITH u AS (SELECT 3 AS value) SELECT value FROM u;\n", + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.run(&ctx, true).await.expect("run should succeed"); + + assert_eq!(formatted_last_results(&benchmark), vec![vec!["3"]]); + } + + #[tokio::test] + async fn run_does_not_save_results_for_non_select_statement() { + let mut benchmark = + parse_benchmark("run\nCREATE TABLE run_created AS SELECT 1 AS value;\n") + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.run(&ctx, true).await.expect("run should succeed"); + + assert!( + benchmark + .last_results + .as_ref() + .expect("last results should be set") + .is_empty() + ); + } + + #[tokio::test] + async fn run_propagates_query_failures_when_buffering_results() { + let mut benchmark = parse_benchmark("run\nSELECT * FROM missing_run_table\n") + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.run(&ctx, true).await, + "missing_run_table", + ); + } + + #[tokio::test] + async fn run_propagates_query_failures_when_streaming_results() { + let mut benchmark = parse_benchmark("run\nSELECT * FROM missing_stream_table\n") + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.run(&ctx, false).await, + "missing_stream_table", + ); + } + + #[tokio::test] + async fn run_rejects_missing_expect_plan_for_buffered_and_streaming_modes() { + let ctx = SessionContext::new(); + let benchmark_text = "expect_plan definitely_not_in_plan\nrun\nSELECT 1\n"; + + let mut buffered = parse_benchmark(benchmark_text) + .await + .expect("benchmark should parse"); + assert_result_error_contains( + buffered.run(&ctx, true).await, + "does not contain the 
expected string 'definitely_not_in_plan'", + ); + + let mut streaming = parse_benchmark(benchmark_text) + .await + .expect("benchmark should parse"); + assert_result_error_contains( + streaming.run(&ctx, false).await, + "does not contain the expected string 'definitely_not_in_plan'", + ); + } + + #[tokio::test] + async fn run_accepts_matching_expect_plan_for_buffered_and_streaming_modes() { + let ctx = SessionContext::new(); + let benchmark_text = "expect_plan PlaceholderRowExec\nrun\nSELECT 1\n"; + + let mut buffered = parse_benchmark(benchmark_text) + .await + .expect("benchmark should parse"); + buffered + .run(&ctx, true) + .await + .expect("buffered run should accept matching plan"); + assert_eq!(formatted_last_results(&buffered), vec![vec!["1"]]); + + let mut streaming = parse_benchmark(benchmark_text) + .await + .expect("benchmark should parse"); + streaming + .run(&ctx, false) + .await + .expect("streaming run should accept matching plan"); + } + + // Verification tests cover result_query and persisted-result comparison paths. + + #[tokio::test] + async fn verify_without_result_query_returns_ok() { + let mut benchmark = parse_benchmark("run\nSELECT 1 AS value\n") + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.verify(&ctx).await.expect("verify should pass"); + } + + #[tokio::test] + async fn verify_errors_when_benchmark_has_not_run() { + let mut benchmark = parse_benchmark( + r#" +result_query I +SELECT 1 AS value +---- +1 + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.verify(&ctx).await, + "No results available for verification. Run the benchmark first.", + ); + } + + #[tokio::test] + async fn verify_uses_last_results_for_result_file_entries() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let result_path = write_test_file(&temp_dir, "result.csv", "value\n1\n"); + let mut benchmark = parse_benchmark(&format!( + "result {}\n\nrun\nSELECT 1 AS value\n", + result_path.display() + )) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.run(&ctx, true).await.expect("run should succeed"); + benchmark.verify(&ctx).await.expect("verify should pass"); + } + + #[tokio::test] + async fn verify_uses_last_results_for_zero_row_result_file_entries() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let result_path = write_test_file(&temp_dir, "result.csv", "value\n"); + let mut benchmark = parse_benchmark(&format!( + "result {}\n\nrun\nSELECT 1 AS value WHERE false\n", + result_path.display() + )) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.run(&ctx, true).await.expect("run should succeed"); + benchmark.verify(&ctx).await.expect("verify should pass"); + } + + #[tokio::test] + async fn verify_rejects_missing_result_file() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let missing_path = temp_dir.path().join("missing_result.csv"); + let mut benchmark = parse_benchmark(&format!( + "result {}\n\nrun\nSELECT 1 AS value\n", + missing_path.display() + )) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.run(&ctx, true).await.expect("run should succeed"); + + assert_result_error_contains(benchmark.verify(&ctx).await, "missing_result.csv"); + } + + #[tokio::test] + async fn verify_rejects_malformed_result_file() 
{ + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let result_path = temp_dir.path().join("malformed_result.csv"); + fs::write(&result_path, [0xff]).expect("failed to write malformed result file"); + let mut benchmark = parse_benchmark(&format!( + "result {}\n\nrun\nSELECT 1 AS value\n", + result_path.display() + )) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.run(&ctx, true).await.expect("run should succeed"); + + assert_result_error_contains(benchmark.verify(&ctx).await, "CSV"); + } + + #[tokio::test] + async fn verify_executes_result_query_instead_of_last_results() { + let mut benchmark = parse_benchmark( + r#" +run +SELECT 100 AS value + +result_query I +SELECT 1 AS value +---- +1 +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.run(&ctx, true).await.expect("run should succeed"); + benchmark.verify(&ctx).await.expect("verify should pass"); + } + + #[tokio::test] + async fn verify_propagates_result_query_failures() { + let mut benchmark = parse_benchmark( + r#" +run +SELECT 1 AS value + +result_query I +SELECT * FROM missing_verify_table +---- +1 +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.run(&ctx, true).await.expect("run should succeed"); + + assert_result_error_contains( + benchmark.verify(&ctx).await, + "missing_verify_table", + ); + } + + #[tokio::test] + async fn verify_reports_result_mismatch_context() { + let mut benchmark = parse_benchmark( + r#" +run +SELECT 1 AS value + +result_query I +SELECT 1 AS value +---- +2 +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.run(&ctx, true).await.expect("run should succeed"); + + let error = benchmark + .verify(&ctx) + .await + .expect_err("verify should fail"); + let message = error.to_string(); + assert!( + message.contains("row 1, column 1") + && message.contains("expected value \"2\"") + && message.contains("got value \"1\""), + "unexpected error: {message}" + ); + } + + // Persistence tests cover CSV writing and persist-time error paths. 
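+    // As checked below, persisted results are pipe-delimited CSV with a header
+    // row that is always written, even for zero-row results, e.g.:
+    //
+    //   a|b
+    //   1|one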
+ + #[tokio::test] + async fn persist_without_result_query_returns_ok() { + let mut benchmark = parse_benchmark("run\nSELECT 1 AS value\n") + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.persist(&ctx).await.expect("persist should pass"); + } + + #[tokio::test] + async fn persist_rejects_result_query_without_file_path() { + let mut benchmark = parse_benchmark( + r#" +run +SELECT 1 AS value + +result_query I +SELECT 1 AS value +---- +1 +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.persist(&ctx).await, + "Unable to persist results from query", + ); + } + + #[tokio::test] + async fn persist_rejects_run_without_saved_result_batches() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let output_path = temp_dir.path().join("persisted"); + let mut benchmark = + parse_benchmark("run\nCREATE TABLE persist_source AS SELECT 1 AS value;\n") + .await + .expect("benchmark should parse"); + benchmark.result_queries.push(BenchmarkQuery { + path: Some(output_path.to_string_lossy().into_owned()), + query: String::new(), + column_count: 1, + expected_result: vec![], + }); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.persist(&ctx).await, + "Results should be loaded", + ); + } + + #[tokio::test] + async fn persist_writes_header_and_pipe_delimited_rows() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let output_path = temp_dir.path().join("persisted"); + let mut benchmark = parse_benchmark("run\nSELECT 1 AS a, 'one' AS b\n") + .await + .expect("benchmark should parse"); + benchmark.result_queries.push(BenchmarkQuery { + path: Some(output_path.to_string_lossy().into_owned()), + query: String::new(), + column_count: 2, + expected_result: vec![], + }); + let ctx = SessionContext::new(); + + benchmark.persist(&ctx).await.expect("persist should pass"); + + let contents = read_all_files_in_dir(&output_path); + assert!( + contents.contains("a|b\n") && contents.contains("1|one\n"), + "unexpected persisted contents: {contents:?}" + ); + } + + #[tokio::test] + async fn persist_writes_header_for_zero_row_select_results() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let output_path = temp_dir.path().join("persisted_empty"); + let mut benchmark = parse_benchmark("run\nSELECT 1 AS value WHERE false\n") + .await + .expect("benchmark should parse"); + benchmark.result_queries.push(BenchmarkQuery { + path: Some(output_path.to_string_lossy().into_owned()), + query: String::new(), + column_count: 1, + expected_result: vec![], + }); + let ctx = SessionContext::new(); + + benchmark.persist(&ctx).await.expect("persist should pass"); + + let contents = read_all_files_in_dir(&output_path); + assert!( + contents.contains("value\n"), + "unexpected persisted contents: {contents:?}" + ); + } + + // Path helper tests cover group derivation from benchmark file paths. 
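+    // By example (see the assertions below):
+    //   sql_benchmarks/tpch/benchmarks/q01.benchmark      -> "tpch"
+    //   /tmp/SQL_BENCHMARKS/Tpch/benchmarks/q01.benchmark -> "Tpch"
+    //       (the benchmark directory itself is matched case-insensitively)
+    //   outside/group/q01.benchmark                       -> "outside"
+    //       (fallback: first path component when outside the benchmark directory)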
+ + #[test] + fn parse_group_from_path_returns_group_under_benchmark_directory() { + let group = parse_group_from_path( + Path::new("sql_benchmarks/tpch/benchmarks/q01.benchmark"), + Path::new("sql_benchmarks"), + ); + + assert_eq!(group, "tpch"); + } + + #[test] + fn parse_group_from_path_matches_benchmark_directory_case_insensitively() { + let group = parse_group_from_path( + Path::new("/tmp/SQL_BENCHMARKS/Tpch/benchmarks/q01.benchmark"), + Path::new("sql_benchmarks"), + ); + + assert_eq!(group, "Tpch"); + } + + #[test] + fn parse_group_from_path_handles_relative_and_absolute_paths() { + let relative = parse_group_from_path( + Path::new("sql_benchmarks/h2o/q01.benchmark"), + Path::new("sql_benchmarks"), + ); + let absolute = parse_group_from_path( + Path::new("/tmp/sql_benchmarks/imdb/q01.benchmark"), + Path::new("sql_benchmarks"), + ); + + assert_eq!(relative, "h2o"); + assert_eq!(absolute, "imdb"); + } + + #[test] + fn parse_group_from_path_pins_fallback_for_paths_outside_benchmark_directory() { + let group = parse_group_from_path( + Path::new("outside/group/q01.benchmark"), + Path::new("sql_benchmarks"), + ); + + assert_eq!(group, "outside"); + } + + #[test] + fn path_ends_with_ignore_ascii_case_matches_component_suffixes() { + assert!(path_ends_with_ignore_ascii_case( + Path::new("/tmp/SQL_BENCHMARKS"), + Path::new("sql_benchmarks") + )); + assert!(!path_ends_with_ignore_ascii_case( + Path::new("/tmp/sql_benchmarks_extra"), + Path::new("sql_benchmarks") + )); + } +} diff --git a/datafusion/core/tests/schema_adapter/mod.rs b/benchmarks/src/tpcds/mod.rs similarity index 95% rename from datafusion/core/tests/schema_adapter/mod.rs rename to benchmarks/src/tpcds/mod.rs index 2f81a43f4736e..4829eb9fd348a 100644 --- a/datafusion/core/tests/schema_adapter/mod.rs +++ b/benchmarks/src/tpcds/mod.rs @@ -15,4 +15,5 @@ // specific language governing permissions and limitations // under the License. -mod schema_adapter_integration_tests; +mod run; +pub use run::RunOpt; diff --git a/benchmarks/src/tpcds/run.rs b/benchmarks/src/tpcds/run.rs new file mode 100644 index 0000000000000..f7ef6991515da --- /dev/null +++ b/benchmarks/src/tpcds/run.rs @@ -0,0 +1,362 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
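+
+//! Runner for the TPC-DS benchmark queries (q1 through q99): registers the
+//! TPC-DS tables as Parquet listing tables (optionally preloaded into a
+//! `MemTable`) and executes each query, recording per-iteration timings.
+//! See [`RunOpt`] for the command line options.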
+
+use std::fs;
+use std::path::PathBuf;
+use std::sync::Arc;
+
+use crate::util::{BenchmarkRun, CommonOpt, QueryResult, print_memory_stats};
+
+use arrow::record_batch::RecordBatch;
+use arrow::util::pretty::{self, pretty_format_batches};
+use datafusion::datasource::file_format::parquet::ParquetFormat;
+use datafusion::datasource::listing::{
+    ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl,
+};
+use datafusion::datasource::{MemTable, TableProvider};
+use datafusion::error::Result;
+use datafusion::physical_plan::display::DisplayableExecutionPlan;
+use datafusion::physical_plan::{collect, displayable};
+use datafusion::prelude::*;
+use datafusion_common::instant::Instant;
+use datafusion_common::utils::get_available_parallelism;
+use datafusion_common::{DEFAULT_PARQUET_EXTENSION, plan_err};
+
+use clap::Args;
+use log::info;
+
+// hack to avoid `default_value is meaningless for bool` errors
+type BoolDefaultTrue = bool;
+pub const TPCDS_QUERY_START_ID: usize = 1;
+pub const TPCDS_QUERY_END_ID: usize = 99;
+
+pub const TPCDS_TABLES: &[&str] = &[
+    "call_center",
+    "customer_address",
+    "household_demographics",
+    "promotion",
+    "store_sales",
+    "web_page",
+    "catalog_page",
+    "customer_demographics",
+    "income_band",
+    "reason",
+    "store",
+    "web_returns",
+    "catalog_returns",
+    "customer",
+    "inventory",
+    "ship_mode",
+    "time_dim",
+    "web_sales",
+    "catalog_sales",
+    "date_dim",
+    "item",
+    "store_returns",
+    "warehouse",
+    "web_site",
+];
+
+/// Get the SQL statements from the specified query file
+pub fn get_query_sql(base_query_path: &str, query: usize) -> Result<Vec<String>> {
+    if query > 0 && query < 100 {
+        let filename = format!("{base_query_path}/{query}.sql");
+        let mut errors = vec![];
+        match fs::read_to_string(&filename) {
+            Ok(contents) => {
+                return Ok(contents
+                    .split(';')
+                    .map(|s| s.trim())
+                    .filter(|s| !s.is_empty())
+                    .map(|s| s.to_string())
+                    .collect());
+            }
+            Err(e) => errors.push(format!("{filename}: {e}")),
+        };
+
+        plan_err!("invalid query. Could not find query: {:?}", errors)
+    } else {
+        plan_err!("invalid query. Expected value between 1 and 99")
+    }
+}
+
+/// Run the TPC-DS benchmark.
+#[derive(Debug, Args, Clone)]
+#[command(verbatim_doc_comment)]
+pub struct RunOpt {
+    /// Query number. If not specified, runs all queries
+    #[arg(short, long)]
+    pub query: Option<usize>,
+
+    /// Common options
+    #[command(flatten)]
+    common: CommonOpt,
+
+    /// Path to data files
+    #[arg(required = true, short = 'p', long = "path")]
+    path: PathBuf,
+
+    /// Path to query files
+    #[arg(required = true, short = 'Q', long = "query_path")]
+    query_path: PathBuf,
+
+    /// Load the data into a MemTable before executing the query
+    #[arg(short = 'm', long = "mem-table")]
+    mem_table: bool,
+
+    /// Path to machine-readable output file
+    #[arg(short = 'o', long = "output")]
+    output_path: Option<PathBuf>,
+
+    /// Whether to disable collection of statistics (and cost-based optimizations).
+    #[arg(short = 'S', long = "disable-statistics")]
+    disable_statistics: bool,
+
+    /// If true then hash join is used, if false then sort merge join.
+    /// True by default.
+    #[arg(short = 'j', long = "prefer_hash_join", default_value = "true")]
+    prefer_hash_join: BoolDefaultTrue,
+
+    /// If true then Piecewise Merge Join can be used, if false then it will opt for Nested Loop Join.
+    /// False by default.
+    #[arg(
+        short = 'w',
+        long = "enable_piecewise_merge_join",
+        default_value = "false"
+    )]
+    enable_piecewise_merge_join: BoolDefaultTrue,
+
+    /// Mark the first column of each table as sorted in ascending order.
+    /// The tables should have been created with the `--sort` option for this to have any effect.
+    #[arg(short = 't', long = "sorted")]
+    sorted: bool,
+
+    /// How many bytes to buffer on the probe side of hash joins.
+    #[arg(long, default_value = "0")]
+    hash_join_buffering_capacity: usize,
+}
+
+impl RunOpt {
+    pub async fn run(self) -> Result<()> {
+        println!("Running benchmarks with the following options: {self:?}");
+        let query_range = match self.query {
+            Some(query_id) => query_id..=query_id,
+            None => TPCDS_QUERY_START_ID..=TPCDS_QUERY_END_ID,
+        };
+
+        let mut benchmark_run = BenchmarkRun::new();
+        let mut config = self
+            .common
+            .config()?
+            .with_collect_statistics(!self.disable_statistics);
+        config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
+        config.options_mut().optimizer.enable_piecewise_merge_join =
+            self.enable_piecewise_merge_join;
+        config.options_mut().execution.hash_join_buffering_capacity =
+            self.hash_join_buffering_capacity;
+        let rt = self.common.build_runtime()?;
+        let ctx = SessionContext::new_with_config_rt(config, rt);
+        // register tables
+        self.register_tables(&ctx).await?;
+
+        for query_id in query_range {
+            benchmark_run.start_new_case(&format!("Query {query_id}"));
+            let query_run = self.benchmark_query(query_id, &ctx).await;
+            match query_run {
+                Ok(query_results) => {
+                    for iter in query_results {
+                        benchmark_run.write_iter(iter.elapsed, iter.row_count);
+                    }
+                }
+                Err(e) => {
+                    benchmark_run.mark_failed();
+                    eprintln!("Query {query_id} failed: {e}");
+                }
+            }
+        }
+        benchmark_run.maybe_write_json(self.output_path.as_ref())?;
+        benchmark_run.maybe_print_failures();
+        Ok(())
+    }
+
+    async fn benchmark_query(
+        &self,
+        query_id: usize,
+        ctx: &SessionContext,
+    ) -> Result<Vec<QueryResult>> {
+        let mut millis = vec![];
+        // run benchmark
+        let mut query_results = vec![];
+
+        let sql = &get_query_sql(self.query_path.to_str().unwrap(), query_id)?;
+
+        if self.common.debug {
+            println!("=== SQL for query {query_id} ===\n{}\n", sql.join(";\n"));
+        }
+
+        for i in 0..self.iterations() {
+            let start = Instant::now();
+
+            // some queries contain multiple statements; only the result of the
+            // last statement executed is kept and reported
+            let mut result = vec![];
+
+            for query in sql {
+                result = self.execute_query(ctx, query).await?;
+            }
+
+            let elapsed = start.elapsed();
+            let ms = elapsed.as_secs_f64() * 1000.0;
+            millis.push(ms);
+            info!("output:\n\n{}\n\n", pretty_format_batches(&result)?);
+            let row_count = result.iter().map(|b| b.num_rows()).sum();
+            println!(
+                "Query {query_id} iteration {i} took {ms:.1} ms and returned {row_count} rows"
+            );
+            query_results.push(QueryResult { elapsed, row_count });
+        }
+
+        let avg = millis.iter().sum::<f64>() / millis.len() as f64;
+        println!("Query {query_id} avg time: {avg:.2} ms");
+
+        // Print memory stats using mimalloc (only when compiled with --features mimalloc_extended)
+        print_memory_stats();
+
+        Ok(query_results)
+    }
+
+    async fn register_tables(&self, ctx: &SessionContext) -> Result<()> {
+        for table in TPCDS_TABLES {
+            let table_provider = { self.get_table(ctx, table).await? };
+
+            if self.mem_table {
+                println!("Loading table '{table}' into memory");
+                let start = Instant::now();
+                let memtable =
+                    MemTable::load(table_provider, Some(self.partitions()), &ctx.state())
+                        .await?;
+                println!(
+                    "Loaded table '{}' into memory in {} ms",
+                    table,
+                    start.elapsed().as_millis()
+                );
+                ctx.register_table(*table, Arc::new(memtable))?;
+            } else {
+                ctx.register_table(*table, table_provider)?;
+            }
+        }
+        Ok(())
+    }
+
+    async fn execute_query(
+        &self,
+        ctx: &SessionContext,
+        sql: &str,
+    ) -> Result<Vec<RecordBatch>> {
+        let debug = self.common.debug;
+        let plan = ctx.sql(sql).await?;
+        let (state, plan) = plan.into_parts();
+
+        if debug {
+            println!("=== Logical plan ===\n{plan}\n");
+        }
+
+        let plan = state.optimize(&plan)?;
+        if debug {
+            println!("=== Optimized logical plan ===\n{plan}\n");
+        }
+        let physical_plan = state.create_physical_plan(&plan).await?;
+        if debug {
+            println!(
+                "=== Physical plan ===\n{}\n",
+                displayable(physical_plan.as_ref()).indent(true)
+            );
+        }
+        let result = collect(physical_plan.clone(), state.task_ctx()).await?;
+        if debug {
+            println!(
+                "=== Physical plan with metrics ===\n{}\n",
+                DisplayableExecutionPlan::with_metrics(physical_plan.as_ref())
+                    .indent(true)
+            );
+            if !result.is_empty() {
+                // do not call print_batches if there are no batches as the result is confusing
+                // and makes it look like there is a batch with no columns
+                pretty::print_batches(&result)?;
+            }
+        }
+        Ok(result)
+    }
+
+    async fn get_table(
+        &self,
+        ctx: &SessionContext,
+        table: &str,
+    ) -> Result<Arc<dyn TableProvider>> {
+        let path = self.path.to_str().unwrap();
+        let target_partitions = self.partitions();
+
+        // Obtain a snapshot of the SessionState
+        let state = ctx.state();
+        let path = format!("{path}/{table}.parquet");
+
+        // Check if the file exists
+        if !std::path::Path::new(&path).exists() {
+            eprintln!("Warning registering {table}: Table file does not exist: {path}");
+        }
+
+        let format = ParquetFormat::default()
+            .with_options(ctx.state().table_options().parquet.clone());
+
+        let table_path = ListingTableUrl::parse(path)?;
+        let options = ListingOptions::new(Arc::new(format))
+            .with_file_extension(DEFAULT_PARQUET_EXTENSION)
+            .with_target_partitions(target_partitions)
+            .with_collect_stat(state.config().collect_statistics());
+        let schema = options.infer_schema(&state, &table_path).await?;
+
+        if self.common.debug {
+            println!(
+                "Inferred schema from {table_path} for table '{table}':\n{schema:#?}\n"
+            );
+        }
+
+        let options = if self.sorted {
+            let key_column_name = schema.fields()[0].name();
+            options
+                .with_file_sort_order(vec![vec![col(key_column_name).sort(true, false)]])
+        } else {
+            options
+        };
+
+        let config = ListingTableConfig::new(table_path)
+            .with_listing_options(options)
+            .with_schema(schema);
+
+        Ok(Arc::new(ListingTable::try_new(config)?))
+    }
+
+    fn iterations(&self) -> usize {
+        self.common.iterations
+    }
+
+    fn partitions(&self) -> usize {
+        self.common
+            .partitions
+            .unwrap_or_else(get_available_parallelism)
+    }
+}
diff --git a/benchmarks/src/tpch/convert.rs b/benchmarks/src/tpch/convert.rs
deleted file mode 100644
index 5219e09cd3052..0000000000000
--- a/benchmarks/src/tpch/convert.rs
+++ /dev/null
@@ -1,162 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use datafusion::logical_expr::select_expr::SelectExpr;
-use datafusion_common::instant::Instant;
-use std::fs;
-use std::path::{Path, PathBuf};
-
-use datafusion::common::not_impl_err;
-
-use super::get_tbl_tpch_table_schema;
-use super::TPCH_TABLES;
-use datafusion::error::Result;
-use datafusion::prelude::*;
-use parquet::basic::Compression;
-use parquet::file::properties::WriterProperties;
-use structopt::StructOpt;
-
-/// Convert tpch .slt files to .parquet or .csv files
-#[derive(Debug, StructOpt)]
-pub struct ConvertOpt {
-    /// Path to csv files
-    #[structopt(parse(from_os_str), required = true, short = "i", long = "input")]
-    input_path: PathBuf,
-
-    /// Output path
-    #[structopt(parse(from_os_str), required = true, short = "o", long = "output")]
-    output_path: PathBuf,
-
-    /// Output file format: `csv` or `parquet`
-    #[structopt(short = "f", long = "format")]
-    file_format: String,
-
-    /// Compression to use when writing Parquet files
-    #[structopt(short = "c", long = "compression", default_value = "zstd")]
-    compression: String,
-
-    /// Number of partitions to produce
-    #[structopt(short = "n", long = "partitions", default_value = "1")]
-    partitions: usize,
-
-    /// Batch size when reading CSV or Parquet files
-    #[structopt(short = "s", long = "batch-size", default_value = "8192")]
-    batch_size: usize,
-
-    /// Sort each table by its first column in ascending order.
-    #[structopt(short = "t", long = "sort")]
-    sort: bool,
-}
-
-impl ConvertOpt {
-    pub async fn run(self) -> Result<()> {
-        let compression = self.compression()?;
-
-        let input_path = self.input_path.to_str().unwrap();
-        let output_path = self.output_path.to_str().unwrap();
-
-        let output_root_path = Path::new(output_path);
-        for table in TPCH_TABLES {
-            let start = Instant::now();
-            let schema = get_tbl_tpch_table_schema(table);
-            let key_column_name = schema.fields()[0].name();
-
-            let input_path = format!("{input_path}/{table}.tbl");
-            let options = CsvReadOptions::new()
-                .schema(&schema)
-                .has_header(false)
-                .delimiter(b'|')
-                .file_extension(".tbl");
-            let options = if self.sort {
-                // indicated that the file is already sorted by its first column to speed up the conversion
-                options
-                    .file_sort_order(vec![vec![col(key_column_name).sort(true, false)]])
-            } else {
-                options
-            };
-
-            let config = SessionConfig::new().with_batch_size(self.batch_size);
-            let ctx = SessionContext::new_with_config(config);
-
-            // build plan to read the TBL file
-            let mut csv = ctx.read_csv(&input_path, options).await?;
-
-            // Select all apart from the padding column
-            let selection = csv
-                .schema()
-                .iter()
-                .take(schema.fields.len() - 1)
-                .map(Expr::from)
-                .map(SelectExpr::from)
-                .collect::<Vec<_>>();
-
-            csv = csv.select(selection)?;
-            // optionally, repartition the file
-            let partitions = self.partitions;
-            if partitions > 1 {
-                csv = csv.repartition(Partitioning::RoundRobinBatch(partitions))?
- } - let csv = if self.sort { - csv.sort_by(vec![col(key_column_name)])? - } else { - csv - }; - - // create the physical plan - let csv = csv.create_physical_plan().await?; - - let output_path = output_root_path.join(table); - let output_path = output_path.to_str().unwrap().to_owned(); - fs::create_dir_all(&output_path)?; - println!( - "Converting '{}' to {} files in directory '{}'", - &input_path, self.file_format, &output_path - ); - match self.file_format.as_str() { - "csv" => ctx.write_csv(csv, output_path).await?, - "parquet" => { - let props = WriterProperties::builder() - .set_compression(compression) - .build(); - ctx.write_parquet(csv, output_path, Some(props)).await? - } - other => { - return not_impl_err!("Invalid output format: {other}"); - } - } - println!("Conversion completed in {} ms", start.elapsed().as_millis()); - } - - Ok(()) - } - - /// return the compression method to use when writing parquet - fn compression(&self) -> Result { - Ok(match self.compression.as_str() { - "none" => Compression::UNCOMPRESSED, - "snappy" => Compression::SNAPPY, - "brotli" => Compression::BROTLI(Default::default()), - "gzip" => Compression::GZIP(Default::default()), - "lz4" => Compression::LZ4, - "lz0" => Compression::LZO, - "zstd" => Compression::ZSTD(Default::default()), - other => { - return not_impl_err!("Invalid compression format: {other}"); - } - }) - } -} diff --git a/benchmarks/src/tpch/mod.rs b/benchmarks/src/tpch/mod.rs index 233ea94a05c1a..08cedc0e5b4c3 100644 --- a/benchmarks/src/tpch/mod.rs +++ b/benchmarks/src/tpch/mod.rs @@ -27,15 +27,13 @@ use std::fs; mod run; pub use run::RunOpt; -mod convert; -pub use convert::ConvertOpt; - pub const TPCH_TABLES: &[&str] = &[ "part", "supplier", "partsupp", "customer", "orders", "lineitem", "nation", "region", ]; pub const TPCH_QUERY_START_ID: usize = 1; pub const TPCH_QUERY_END_ID: usize = 22; +const TPCH_Q11_FRACTION_SENTINEL: &str = "0.0001 /* __TPCH_Q11_FRACTION__ */"; /// The `.tbl` file contains a trailing column pub fn get_tbl_tpch_table_schema(table: &str) -> Schema { @@ -142,6 +140,21 @@ pub fn get_tpch_table_schema(table: &str) -> Schema { /// Get the SQL statements from the specified query file pub fn get_query_sql(query: usize) -> Result> { + get_query_sql_for_scale_factor(query, 1.0) +} + +/// Get the SQL statements from the specified query file using the provided scale factor for +/// TPC-H substitutions such as Q11 FRACTION. +pub fn get_query_sql_for_scale_factor( + query: usize, + scale_factor: f64, +) -> Result> { + if !(scale_factor.is_finite() && scale_factor > 0.0) { + return plan_err!( + "invalid scale factor. Expected a positive finite value, got {scale_factor}" + ); + } + if query > 0 && query < 23 { let possibilities = vec![ format!("queries/q{query}.sql"), @@ -151,6 +164,7 @@ pub fn get_query_sql(query: usize) -> Result> { for filename in possibilities { match fs::read_to_string(&filename) { Ok(contents) => { + let contents = customize_query_sql(query, contents, scale_factor)?; return Ok(contents .split(';') .map(|s| s.trim()) @@ -167,6 +181,27 @@ pub fn get_query_sql(query: usize) -> Result> { } } +fn customize_query_sql( + query: usize, + contents: String, + scale_factor: f64, +) -> Result { + if query != 11 { + return Ok(contents); + } + + if !contents.contains(TPCH_Q11_FRACTION_SENTINEL) { + return plan_err!( + "invalid query 11. 
Missing fraction marker {TPCH_Q11_FRACTION_SENTINEL}" + ); + } + + Ok(contents.replace( + TPCH_Q11_FRACTION_SENTINEL, + &format!("(0.0001 / {scale_factor})"), + )) +} + pub const QUERY_LIMIT: [Option<usize>; 22] = [ None, Some(100), @@ -191,3 +226,51 @@ pub const QUERY_LIMIT: [Option<usize>; 22] = [ Some(100), None, ]; + +#[cfg(test)] +mod tests { + use super::{get_query_sql, get_query_sql_for_scale_factor}; + use datafusion::error::Result; + + fn get_single_query(query: usize) -> Result<String> { + let mut queries = get_query_sql(query)?; + assert_eq!(queries.len(), 1); + Ok(queries.remove(0)) + } + + fn get_single_query_for_scale_factor( + query: usize, + scale_factor: f64, + ) -> Result<String> { + let mut queries = get_query_sql_for_scale_factor(query, scale_factor)?; + assert_eq!(queries.len(), 1); + Ok(queries.remove(0)) + } + + #[test] + fn q11_uses_scale_factor_substitution() -> Result<()> { + let sf1_sql = get_single_query(11)?; + assert!(sf1_sql.contains("(0.0001 / 1)")); + + let sf01_sql = get_single_query_for_scale_factor(11, 0.1)?; + assert!(sf01_sql.contains("(0.0001 / 0.1)")); + + let sf10_sql = get_single_query_for_scale_factor(11, 10.0)?; + assert!(sf10_sql.contains("(0.0001 / 10)")); + + let sf30_sql = get_single_query_for_scale_factor(11, 30.0)?; + assert!(sf30_sql.contains("(0.0001 / 30)")); + assert!(!sf10_sql.contains("__TPCH_Q11_FRACTION__")); + Ok(()) + } + + #[test] + fn interval_queries_use_interval_arithmetic() -> Result<()> { + assert!(get_single_query(5)?.contains("date '1994-01-01' + interval '1' year")); + assert!(get_single_query(6)?.contains("date '1994-01-01' + interval '1' year")); + assert!(get_single_query(10)?.contains("date '1993-10-01' + interval '3' month")); + assert!(get_single_query(12)?.contains("date '1994-01-01' + interval '1' year")); + assert!(get_single_query(14)?.contains("date '1995-09-01' + interval '1' month")); + Ok(()) + } +} diff --git a/benchmarks/src/tpch/run.rs b/benchmarks/src/tpch/run.rs index cc59b78030360..ec7aa8c554a28 100644 --- a/benchmarks/src/tpch/run.rs +++ b/benchmarks/src/tpch/run.rs @@ -15,20 +15,21 @@ // specific language governing permissions and limitations // under the License.
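Per the TPC-H spec, Q11's FRACTION parameter is 0.0001 divided by the scale factor, which is what the sentinel replacement above implements. A minimal usage sketch of the new entry point (assuming the benchmarks crate is importable as `datafusion_benchmarks`; that path is illustrative, not part of this diff):

    // Illustrative only: fetch TPC-H Q11 rewritten for scale factor 10.
    use datafusion_benchmarks::tpch::get_query_sql_for_scale_factor;

    fn q11_for_sf10() -> datafusion::error::Result<Vec<String>> {
        let statements = get_query_sql_for_scale_factor(11, 10.0)?;
        // The sentinel `0.0001 /* __TPCH_Q11_FRACTION__ */` in q11.sql is
        // replaced with `(0.0001 / 10)` before the SQL is split on ';'.
        assert!(statements.iter().any(|s| s.contains("(0.0001 / 10)")));
        Ok(statements)
    }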
-use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::sync::Arc; use super::{ - get_query_sql, get_tbl_tpch_table_schema, get_tpch_table_schema, TPCH_QUERY_END_ID, - TPCH_QUERY_START_ID, TPCH_TABLES, + TPCH_QUERY_END_ID, TPCH_QUERY_START_ID, TPCH_TABLES, get_query_sql_for_scale_factor, + get_tbl_tpch_table_schema, get_tpch_table_schema, }; -use crate::util::{print_memory_stats, BenchmarkRun, CommonOpt, QueryResult}; +use crate::util::{BenchmarkRun, CommonOpt, QueryResult, print_memory_stats}; use arrow::record_batch::RecordBatch; use arrow::util::pretty::{self, pretty_format_batches}; +use datafusion::common::exec_err; +use datafusion::datasource::file_format::FileFormat; use datafusion::datasource::file_format::csv::CsvFormat; use datafusion::datasource::file_format::parquet::ParquetFormat; -use datafusion::datasource::file_format::FileFormat; use datafusion::datasource::listing::{ ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl, }; @@ -41,8 +42,8 @@ use datafusion_common::instant::Instant; use datafusion_common::utils::get_available_parallelism; use datafusion_common::{DEFAULT_CSV_EXTENSION, DEFAULT_PARQUET_EXTENSION}; +use clap::Args; use log::info; -use structopt::StructOpt; // hack to avoid `default_value is meaningless for bool` errors type BoolDefaultTrue = bool; @@ -56,46 +57,51 @@ type BoolDefaultTrue = bool; /// [1]: http://www.tpc.org/tpch/ /// [2]: https://github.com/databricks/tpch-dbgen.git /// [2.17.1]: https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-h_v2.17.1.pdf -#[derive(Debug, StructOpt, Clone)] -#[structopt(verbatim_doc_comment)] +#[derive(Debug, Args, Clone)] +#[command(verbatim_doc_comment)] pub struct RunOpt { /// Query number. If not specified, runs all queries - #[structopt(short, long)] + #[arg(short, long)] pub query: Option<usize>, /// Common options - #[structopt(flatten)] + #[command(flatten)] common: CommonOpt, /// Path to data files - #[structopt(parse(from_os_str), required = true, short = "p", long = "path")] + #[arg(required = true, short = 'p', long = "path")] path: PathBuf, + /// TPC-H scale factor used for query substitutions such as Q11 FRACTION. + /// If omitted, the benchmark tries to infer it from paths like `.../tpch_sf10/...`. + #[arg(long)] + scale_factor: Option<f64>, + /// File format: `csv` or `parquet` - #[structopt(short = "f", long = "format", default_value = "csv")] + #[arg(short = 'f', long = "format", default_value = "csv")] file_format: String, /// Load the data into a MemTable before executing the query - #[structopt(short = "m", long = "mem-table")] + #[arg(short = 'm', long = "mem-table")] mem_table: bool, /// Path to machine readable output file - #[structopt(parse(from_os_str), short = "o", long = "output")] + #[arg(short = 'o', long = "output")] output_path: Option<PathBuf>, /// Whether to disable collection of statistics (and cost based optimizations) or not. - #[structopt(short = "S", long = "disable-statistics")] + #[arg(short = 'S', long = "disable-statistics")] disable_statistics: bool, /// If true then hash join used, if false then sort merge join /// True by default. - #[structopt(short = "j", long = "prefer_hash_join", default_value = "true")] + #[arg(short = 'j', long = "prefer_hash_join", default_value = "true")] prefer_hash_join: BoolDefaultTrue, /// If true then Piecewise Merge Join can be used, if false then it will opt for Nested Loop Join - /// True by default. - #[structopt( - short = "j", + /// False by default.
+ #[arg( + short = 'w', long = "enable_piecewise_merge_join", default_value = "false" )] @@ -103,8 +109,12 @@ pub struct RunOpt { /// Mark the first column of each table as sorted in ascending order. /// The tables should have been created with the `--sort` option for this to have any effect. - #[structopt(short = "t", long = "sorted")] + #[arg(short = 't', long = "sorted")] sorted: bool, + + /// How many bytes to buffer on the probe side of hash joins. + #[arg(long, default_value = "0")] + hash_join_buffering_capacity: usize, } impl RunOpt { @@ -123,14 +133,17 @@ impl RunOpt { config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join; config.options_mut().optimizer.enable_piecewise_merge_join = self.enable_piecewise_merge_join; - let rt_builder = self.common.runtime_env_builder()?; - let ctx = SessionContext::new_with_config_rt(config, rt_builder.build_arc()?); + config.options_mut().execution.hash_join_buffering_capacity = + self.hash_join_buffering_capacity; + let rt = self.common.build_runtime()?; + let ctx = SessionContext::new_with_config_rt(config, rt); // register tables self.register_tables(&ctx).await?; + let scale_factor = self.scale_factor()?; for query_id in query_range { benchmark_run.start_new_case(&format!("Query {query_id}")); - let query_run = self.benchmark_query(query_id, &ctx).await; + let query_run = self.benchmark_query(query_id, scale_factor, &ctx).await; match query_run { Ok(query_results) => { for iter in query_results { @@ -151,13 +164,14 @@ impl RunOpt { async fn benchmark_query( &self, query_id: usize, + scale_factor: f64, ctx: &SessionContext, ) -> Result<Vec<QueryResult>> { let mut millis = vec![]; // run benchmark let mut query_results = vec![]; - let sql = &get_query_sql(query_id)?; + let sql = &get_query_sql_for_scale_factor(query_id, scale_factor)?; for i in 0..self.iterations() { let start = Instant::now(); @@ -340,6 +354,82 @@ impl RunOpt { .partitions .unwrap_or_else(get_available_parallelism) } + + fn scale_factor(&self) -> Result<f64> { + resolve_scale_factor(self.scale_factor, &self.path) + } +} + +fn resolve_scale_factor(scale_factor: Option<f64>, path: &Path) -> Result<f64> { + let scale_factor = scale_factor + .or_else(|| infer_scale_factor_from_path(path)) + .unwrap_or(1.0); + + if scale_factor.is_finite() && scale_factor > 0.0 { + Ok(scale_factor) + } else { + exec_err!( + "Invalid TPC-H scale factor {scale_factor}. Expected a positive finite value" + ) + } +} + +fn infer_scale_factor_from_path(path: &Path) -> Option<f64> { + path.iter().find_map(|component| { + component + .to_str()? + .strip_prefix("tpch_sf")? + .parse::<f64>() + .ok() + }) +} + +#[cfg(test)] +mod scale_factor_tests { + use std::path::Path; + + use super::{infer_scale_factor_from_path, resolve_scale_factor}; + use datafusion::error::Result; + + #[test] + fn uses_explicit_scale_factor_when_provided() -> Result<()> { + let scale_factor = + resolve_scale_factor(Some(30.0), Path::new("benchmarks/data/tpch_sf10"))?; + assert_eq!(scale_factor, 30.0); + Ok(()) + } + + #[test] + fn infers_scale_factor_from_standard_tpch_path() -> Result<()> { + let scale_factor = + resolve_scale_factor(None, Path::new("benchmarks/data/tpch_sf10"))?; + assert_eq!(scale_factor, 10.0); + assert_eq!( + infer_scale_factor_from_path(Path::new("benchmarks/data/tpch_sf0.1")), + Some(0.1) + ); + Ok(()) + } + + #[test] + fn defaults_to_sf1_when_path_has_no_scale_factor() -> Result<()> { + let scale_factor = resolve_scale_factor(None, Path::new("benchmarks/data"))?; + assert_eq!(scale_factor, 1.0); + Ok(()) + } + + #[test] + fn rejects_invalid_scale_factors() { + assert!(resolve_scale_factor(Some(0.0), Path::new("benchmarks/data")).is_err()); + assert!(resolve_scale_factor(Some(-1.0), Path::new("benchmarks/data")).is_err()); + assert!( + resolve_scale_factor(Some(f64::NAN), Path::new("benchmarks/data")).is_err() + ); + assert!( + resolve_scale_factor(Some(f64::INFINITY), Path::new("benchmarks/data")) + .is_err() + ); + } } #[cfg(test)] @@ -380,11 +470,13 @@ mod tests { memory_limit: None, sort_spill_reservation_bytes: None, debug: false, + simulate_latency: false, }; let opt = RunOpt { query: Some(query), common, path: PathBuf::from(path.to_string()), + scale_factor: Some(1.0), file_format: "tbl".to_string(), mem_table: false, output_path: None, @@ -392,9 +484,10 @@ mod tests { prefer_hash_join: true, enable_piecewise_merge_join: false, sorted: false, + hash_join_buffering_capacity: 0, }; opt.register_tables(&ctx).await?; - let queries = get_query_sql(query)?; + let queries = crate::tpch::get_query_sql(query)?; for query in queries { let plan = ctx.sql(&query).await?; let plan = plan.into_optimized_plan()?; @@ -418,11 +511,13 @@ mod tests { memory_limit: None, sort_spill_reservation_bytes: None, debug: false, + simulate_latency: false, }; let opt = RunOpt { query: Some(query), common, path: PathBuf::from(path.to_string()), + scale_factor: Some(1.0), file_format: "tbl".to_string(), mem_table: false, output_path: None, @@ -392,9 +484,10 @@ mod tests { prefer_hash_join: true, enable_piecewise_merge_join: false, sorted: false, + hash_join_buffering_capacity: 0, }; opt.register_tables(&ctx).await?; - let queries = get_query_sql(query)?; + let queries = crate::tpch::get_query_sql(query)?; for query in queries { let plan = ctx.sql(&query).await?; let plan = plan.create_physical_plan().await?; diff --git a/benchmarks/src/util/latency_object_store.rs b/benchmarks/src/util/latency_object_store.rs new file mode 100644 index 0000000000000..9ef8d1b78b751 --- /dev/null +++ b/benchmarks/src/util/latency_object_store.rs @@ -0,0 +1,157 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! An ObjectStore wrapper that adds simulated S3-like latency to get and list operations. +//! +//! Cycles through a fixed latency distribution inspired by real S3 performance: +//! - P50: ~30ms +//! - P75-P90: ~100-120ms +//! - P99: ~150-200ms + +use std::fmt; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::time::Duration; + +use async_trait::async_trait; +use futures::StreamExt; +use futures::stream::BoxStream; +use object_store::path::Path; +use object_store::{ + CopyOptions, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, + ObjectStore, PutMultipartOptions, PutOptions, PutPayload, PutResult, Result, +}; + +/// GET latency distribution, inspired by S3 latencies. +/// Deterministic but shuffled to avoid artificial patterns. +/// 20 values: 11x P50 (~25-35ms), 5x P75-P90 (~70-110ms), 2x P95 (~120-150ms), 2x P99 (~180-200ms) +/// Sorted: 25,25,28,28,30,30,30,30,32,32,35, 70,85,100,100,110, 130,150, 180,200 +/// P50≈32ms, P90≈110ms, P99≈200ms +const GET_LATENCIES_MS: &[u64] = &[ + 30, 100, 25, 85, 32, 200, 28, 130, 35, 70, 30, 150, 30, 110, 28, 180, 32, 25, 100, 30, +]; + +/// LIST latency distribution, generally higher than GET. +/// 20 values: 11x P50 (~40-70ms), 5x P75-P90 (~120-180ms), 2x P95 (~200-250ms), 2x P99 (~300-400ms) +/// Sorted: 40,40,50,50,55,55,60,60,65,65,70, 120,140,160,160,180, 210,250, 300,400 +/// P50≈65ms, P90≈180ms, P99≈400ms +const LIST_LATENCIES_MS: &[u64] = &[ + 55, 160, 40, 140, 65, 400, 50, 210, 70, 120, 60, 250, 55, 180, 50, 300, 65, 40, 160, + 60, +]; + +/// An ObjectStore wrapper that injects simulated latency on get and list calls. 
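The fixed tables above trade randomness for reproducibility: every benchmark run observes the same latency sequence, so run-to-run timing differences come from the engine, not the simulated store. A reduced, self-contained sketch of the cycling pattern (latency values shortened and hypothetical):

    use std::sync::atomic::{AtomicUsize, Ordering};
    use std::time::Duration;

    // Hypothetical, shortened latency table; the real ones are above.
    const LATENCIES_MS: &[u64] = &[30, 100, 25, 85];

    fn next_latency(counter: &AtomicUsize) -> Duration {
        // Relaxed ordering suffices: each call only needs a fresh index,
        // not synchronization with other memory operations. The modulo
        // keeps the index in bounds even after the counter wraps.
        let idx = counter.fetch_add(1, Ordering::Relaxed) % LATENCIES_MS.len();
        Duration::from_millis(LATENCIES_MS[idx])
    }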
+#[derive(Debug)] +pub struct LatencyObjectStore<T: ObjectStore> { + inner: T, + get_counter: AtomicUsize, + list_counter: AtomicUsize, +} + +impl<T: ObjectStore> LatencyObjectStore<T> { + pub fn new(inner: T) -> Self { + Self { + inner, + get_counter: AtomicUsize::new(0), + list_counter: AtomicUsize::new(0), + } + } + + fn next_get_latency(&self) -> Duration { + let idx = + self.get_counter.fetch_add(1, Ordering::Relaxed) % GET_LATENCIES_MS.len(); + Duration::from_millis(GET_LATENCIES_MS[idx]) + } + + fn next_list_latency(&self) -> Duration { + let idx = + self.list_counter.fetch_add(1, Ordering::Relaxed) % LIST_LATENCIES_MS.len(); + Duration::from_millis(LIST_LATENCIES_MS[idx]) + } +} + +impl<T: ObjectStore> fmt::Display for LatencyObjectStore<T> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "LatencyObjectStore({})", self.inner) + } +} + +#[async_trait] +impl<T: ObjectStore> ObjectStore for LatencyObjectStore<T> { + async fn put_opts( + &self, + location: &Path, + payload: PutPayload, + opts: PutOptions, + ) -> Result<PutResult> { + self.inner.put_opts(location, payload, opts).await + } + + async fn put_multipart_opts( + &self, + location: &Path, + opts: PutMultipartOptions, + ) -> Result<Box<dyn MultipartUpload>> { + self.inner.put_multipart_opts(location, opts).await + } + + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result<GetResult> { + tokio::time::sleep(self.next_get_latency()).await; + self.inner.get_opts(location, options).await + } + + async fn get_ranges( + &self, + location: &Path, + ranges: &[std::ops::Range<u64>], + ) -> Result<Vec<Bytes>> { + tokio::time::sleep(self.next_get_latency()).await; + self.inner.get_ranges(location, ranges).await + } + + fn delete_stream( + &self, + locations: BoxStream<'static, Result<Path>>, + ) -> BoxStream<'static, Result<Path>> { + self.inner.delete_stream(locations) + } + + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result<ObjectMeta>> { + let latency = self.next_list_latency(); + let stream = self.inner.list(prefix); + futures::stream::once(async move { + tokio::time::sleep(latency).await; + futures::stream::empty() + }) + .flatten() + .chain(stream) + .boxed() + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result<ListResult> { + tokio::time::sleep(self.next_list_latency()).await; + self.inner.list_with_delimiter(prefix).await + } + + async fn copy_opts( + &self, + from: &Path, + to: &Path, + options: CopyOptions, + ) -> Result<()> { + self.inner.copy_opts(from, to, options).await + } +} diff --git a/benchmarks/src/util/memory.rs b/benchmarks/src/util/memory.rs index 944239df31cfd..11b96ef227756 100644 --- a/benchmarks/src/util/memory.rs +++ b/benchmarks/src/util/memory.rs @@ -19,7 +19,7 @@ pub fn print_memory_stats() { #[cfg(all(feature = "mimalloc", feature = "mimalloc_extended"))] { - use datafusion::execution::memory_pool::human_readable_size; + use datafusion_common::human_readable_size; let mut peak_rss = 0; let mut peak_commit = 0; let mut page_faults = 0; diff --git a/benchmarks/src/util/mod.rs b/benchmarks/src/util/mod.rs index ab4579a566f66..6dc11c0f425bd 100644 --- a/benchmarks/src/util/mod.rs +++ b/benchmarks/src/util/mod.rs @@ -16,6 +16,7 @@ // under the License. //!
Shared benchmark utilities +pub mod latency_object_store; mod memory; mod options; mod run; diff --git a/benchmarks/src/util/options.rs b/benchmarks/src/util/options.rs index 6627a287dfcd4..a3e6d2a4c5538 100644 --- a/benchmarks/src/util/options.rs +++ b/benchmarks/src/util/options.rs @@ -17,50 +17,59 @@ use std::{num::NonZeroUsize, sync::Arc}; +use clap::Args; use datafusion::{ execution::{ disk_manager::DiskManagerBuilder, memory_pool::{FairSpillPool, GreedyMemoryPool, MemoryPool, TrackConsumersPool}, - runtime_env::RuntimeEnvBuilder, + object_store::ObjectStoreUrl, + runtime_env::{RuntimeEnv, RuntimeEnvBuilder}, }, prelude::SessionConfig, }; use datafusion_common::{DataFusionError, Result}; -use structopt::StructOpt; +use object_store::local::LocalFileSystem; + +use super::latency_object_store::LatencyObjectStore; // Common benchmark options (don't use doc comments otherwise this doc // shows up in help files) -#[derive(Debug, StructOpt, Clone)] +#[derive(Debug, Args, Clone)] pub struct CommonOpt { /// Number of iterations of each test run - #[structopt(short = "i", long = "iterations", default_value = "3")] + #[arg(short = 'i', long = "iterations", default_value = "3", env)] pub iterations: usize, /// Number of partitions to process in parallel. Defaults to number of available cores. - #[structopt(short = "n", long = "partitions")] + #[arg(short = 'n', long = "partitions", env)] pub partitions: Option<usize>, /// Batch size when reading CSV or Parquet files - #[structopt(short = "s", long = "batch-size")] + #[arg(short = 's', long = "batch-size", env)] pub batch_size: Option<usize>, /// The memory pool type to use, should be one of "fair" or "greedy" - #[structopt(long = "mem-pool-type", default_value = "fair")] + #[arg(long = "mem-pool-type", default_value = "fair", env)] pub mem_pool_type: String, /// Memory limit (e.g. '100M', '1.5G'). If not specified, run all pre-defined memory limits for given query /// if there's any, otherwise run with no memory limit. - #[structopt(long = "memory-limit", parse(try_from_str = parse_memory_limit))] + #[arg(long = "memory-limit", value_parser = parse_capacity_limit, env)] pub memory_limit: Option<usize>, /// The amount of memory to reserve for sort spill operations. DataFusion's default value will be used /// if not specified. - #[structopt(long = "sort-spill-reservation-bytes", parse(try_from_str = parse_memory_limit))] + #[arg(long = "sort-spill-reservation-bytes", value_parser = parse_capacity_limit, env)] pub sort_spill_reservation_bytes: Option<usize>, /// Activate debug mode to see more details - #[structopt(short, long)] + #[arg(short, long, env)] pub debug: bool, + + /// Simulate object store latency to mimic remote storage (e.g. S3). + /// Adds deterministic S3-like latency (get: 25-200ms, list: 40-400ms) to object store get and list operations.
+ #[arg(long = "simulate-latency", env)] + pub simulate_latency: bool, } impl CommonOpt { @@ -91,7 +100,15 @@ impl CommonOpt { pub fn runtime_env_builder(&self) -> Result { let mut rt_builder = RuntimeEnvBuilder::new(); const NUM_TRACKED_CONSUMERS: usize = 5; - if let Some(memory_limit) = self.memory_limit { + // Use CLI --memory-limit if provided, otherwise fall back to + // DATAFUSION_RUNTIME_MEMORY_LIMIT env var + let memory_limit = self.memory_limit.or_else(|| { + std::env::var("DATAFUSION_RUNTIME_MEMORY_LIMIT") + .ok() + .and_then(|val| parse_capacity_limit(&val).ok()) + }); + + if let Some(memory_limit) = memory_limit { let pool: Arc = match self.mem_pool_type.as_str() { "fair" => Arc::new(TrackConsumersPool::new( FairSpillPool::new(memory_limit), @@ -105,7 +122,7 @@ impl CommonOpt { return Err(DataFusionError::Configuration(format!( "Invalid memory pool type: {}", self.mem_pool_type - ))) + ))); } }; rt_builder = rt_builder @@ -114,22 +131,44 @@ impl CommonOpt { } Ok(rt_builder) } + + /// Build the runtime environment, optionally wrapping the local filesystem + /// with a throttled object store to simulate remote storage latency. + pub fn build_runtime(&self) -> Result> { + let rt = self.runtime_env_builder()?.build_arc()?; + if self.simulate_latency { + let store: Arc = + Arc::new(LatencyObjectStore::new(LocalFileSystem::new())); + let url = ObjectStoreUrl::parse("file:///")?; + rt.register_object_store(url.as_ref(), store); + println!( + "Simulating S3-like object store latency (get: 25-200ms, list: 40-400ms)" + ); + } + Ok(rt) + } } -/// Parse memory limit from string to number of bytes -/// e.g. '1.5G', '100M' -> 1572864 -fn parse_memory_limit(limit: &str) -> Result { +/// Parse capacity limit from string to number of bytes by allowing units: K, M and G. +/// Supports formats like '1.5G' -> 1610612736, '100M' -> 104857600 +fn parse_capacity_limit(limit: &str) -> Result { + if limit.trim().is_empty() { + return Err("Capacity limit cannot be empty".to_string()); + } let (number, unit) = limit.split_at(limit.len() - 1); let number: f64 = number .parse() - .map_err(|_| format!("Failed to parse number from memory limit '{limit}'"))?; + .map_err(|_| format!("Failed to parse number from capacity limit '{limit}'"))?; + if number.is_sign_negative() || number.is_infinite() { + return Err("Limit value should be positive finite number".to_string()); + } match unit { "K" => Ok((number * 1024.0) as usize), "M" => Ok((number * 1024.0 * 1024.0) as usize), "G" => Ok((number * 1024.0 * 1024.0 * 1024.0) as usize), _ => Err(format!( - "Unsupported unit '{unit}' in memory limit '{limit}'" + "Unsupported unit '{unit}' in capacity limit '{limit}'. 
Unit must be one of: 'K', 'M', 'G'" )), } } @@ -139,16 +178,59 @@ mod tests { use super::*; #[test] - fn test_parse_memory_limit_all() { + fn test_runtime_env_builder_reads_env_var() { + // Set the env var and verify runtime_env_builder picks it up + // when no CLI --memory-limit is provided + let opt = CommonOpt { + iterations: 3, + partitions: None, + batch_size: None, + mem_pool_type: "fair".to_string(), + memory_limit: None, + sort_spill_reservation_bytes: None, + debug: false, + simulate_latency: false, + }; + + // With env var set, builder should succeed and have a memory pool + // SAFETY: This test is single-threaded and the env var is restored after use + unsafe { + std::env::set_var("DATAFUSION_RUNTIME_MEMORY_LIMIT", "2G"); + } + let builder = opt.runtime_env_builder().unwrap(); + let runtime = builder.build().unwrap(); + unsafe { + std::env::remove_var("DATAFUSION_RUNTIME_MEMORY_LIMIT"); + } + // A 2G memory pool should be present — verify it reports the correct limit + match runtime.memory_pool.memory_limit() { + datafusion::execution::memory_pool::MemoryLimit::Finite(limit) => { + assert_eq!(limit, 2 * 1024 * 1024 * 1024); + } + _ => panic!("Expected Finite memory limit"), + } + } + + #[test] + fn test_parse_capacity_limit_all() { // Test valid inputs - assert_eq!(parse_memory_limit("100K").unwrap(), 102400); - assert_eq!(parse_memory_limit("1.5M").unwrap(), 1572864); - assert_eq!(parse_memory_limit("2G").unwrap(), 2147483648); + assert_eq!(parse_capacity_limit("100K").unwrap(), 102400); + assert_eq!(parse_capacity_limit("1.5M").unwrap(), 1572864); + assert_eq!(parse_capacity_limit("2G").unwrap(), 2147483648); // Test invalid unit - assert!(parse_memory_limit("500X").is_err()); + assert!(parse_capacity_limit("500X").is_err()); // Test invalid number - assert!(parse_memory_limit("abcM").is_err()); + assert!(parse_capacity_limit("abcM").is_err()); + + // Test negative number + assert!(parse_capacity_limit("-1M").is_err()); + + // Test infinite number + assert!(parse_capacity_limit("infM").is_err()); + + // Test negative infinite number + assert!(parse_capacity_limit("-infM").is_err()); } } diff --git a/benchmarks/src/util/run.rs b/benchmarks/src/util/run.rs index 764ea648ff725..df17674e62961 100644 --- a/benchmarks/src/util/run.rs +++ b/benchmarks/src/util/run.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use datafusion::{error::Result, DATAFUSION_VERSION}; +use datafusion::{DATAFUSION_VERSION, error::Result}; use datafusion_common::utils::get_available_parallelism; use serde::{Serialize, Serializer}; use serde_json::Value; diff --git a/ci/scripts/changed_crates.sh b/ci/scripts/changed_crates.sh new file mode 100755 index 0000000000000..6d014a9492632 --- /dev/null +++ b/ci/scripts/changed_crates.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +# Helper script for the breaking-changes-detector workflow. +# +# Subcommands: +# changed-crates +# Print space-separated list of crate names whose files changed vs base_ref. +# Only published workspace members (those without `publish = false`) are +# considered. +# +# semver-check +# Run cargo-semver-checks for the given packages against base_ref. +# Output and exit code are passed through unchanged; the caller is +# responsible for capturing/formatting them. + +set -euo pipefail + +# ── changed-crates ────────────────────────────────────────────────── +cmd_changed_crates() { + local base_ref="${1:?Usage: changed_crates.sh changed-crates }" + + # 1. Files changed between the PR and the base branch. + local changed_files + changed_files=$(git diff --name-only "${base_ref}...HEAD") + + # 2. Every publishable workspace member, one per line as + # " ". `publish = false` in Cargo.toml shows + # up as `"publish": []` in cargo metadata, so filtering on that + # excludes internal crates without a manual exclusion list. + local crates + crates=$(cargo metadata --no-deps --format-version 1 | jq -r ' + (.workspace_root + "/") as $root + | .packages[] + | select(.publish != []) + | "\(.name) \(.manifest_path | ltrimstr($root) | rtrimstr("/Cargo.toml"))" + ') + + # 3. Keep crates whose directory contains a changed file. + while read -r name dir; do + if grep -q "^${dir}/" <<<"$changed_files"; then + echo "$name" + fi + done <<<"$crates" | xargs +} + +# ── semver-check ──────────────────────────────────────────────────── +cmd_semver_check() { + local base_ref="${1:?Usage: changed_crates.sh semver-check }" + shift + + local args=() + for pkg in "$@"; do + args+=(--package "$pkg") + done + + cargo semver-checks --baseline-rev "$base_ref" "${args[@]}" +} + +# ── main ──────────────────────────────────────────────────────────── +cmd="${1:?Usage: changed_crates.sh [args...]}" +shift + +case "$cmd" in + changed-crates) cmd_changed_crates "$@" ;; + semver-check) cmd_semver_check "$@" ;; + *) echo "Unknown command: $cmd" >&2; exit 1 ;; +esac diff --git a/ci/scripts/check_asf_yaml_status_checks.py b/ci/scripts/check_asf_yaml_status_checks.py new file mode 100644 index 0000000000000..135654159051c --- /dev/null +++ b/ci/scripts/check_asf_yaml_status_checks.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Validate that every entry in .asf.yaml required_status_checks +matches an actual GitHub Actions job name, and that the workflow +is not filtered by paths/paths-ignore (which would prevent the +check from running on some PRs, blocking merges). 
+ +A typo or stale entry in required_status_checks will block all +merges for the project, so this check catches that early. +""" + +import glob +import os +import sys + +import yaml + + +def get_required_checks(asf_yaml_path): + """Extract all required_status_checks contexts from .asf.yaml.""" + with open(asf_yaml_path) as f: + config = yaml.safe_load(f) + + checks = {} # context -> list of branches requiring it + branches = config.get("github", {}).get("protected_branches", {}) + for branch, settings in branches.items(): + contexts = ( + settings.get("required_status_checks", {}).get("contexts", []) + ) + for ctx in contexts: + checks.setdefault(ctx, []).append(branch) + + return checks + + +def get_workflow_jobs(workflows_dir): + """Collect all jobs with their metadata from GitHub Actions workflow files. + + Returns a dict mapping job identifier (name or key) to a list of + (workflow_file, has_path_filters) tuples. + """ + jobs = {} # identifier -> [(workflow_file, has_path_filters)] + for workflow_file in sorted(glob.glob(os.path.join(workflows_dir, "*.yml"))): + with open(workflow_file) as f: + workflow = yaml.safe_load(f) + + if not workflow or "jobs" not in workflow: + continue + + # Check if pull_request trigger has path filters + on = workflow.get(True, workflow.get("on", {})) # yaml parses `on:` as True + pr_trigger = on.get("pull_request", {}) if isinstance(on, dict) else {} + has_path_filters = bool( + isinstance(pr_trigger, dict) + and (pr_trigger.get("paths") or pr_trigger.get("paths-ignore")) + ) + + basename = os.path.basename(workflow_file) + for job_key, job_config in workflow.get("jobs", {}).items(): + if not isinstance(job_config, dict): + continue + job_name = job_config.get("name", job_key) + info = (basename, has_path_filters) + jobs.setdefault(job_name, []).append(info) + if job_key != job_name: + jobs.setdefault(job_key, []).append(info) + + return jobs + + +def main(): + repo_root = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + ) + asf_yaml = os.path.join(repo_root, ".asf.yaml") + workflows_dir = os.path.join(repo_root, ".github", "workflows") + + required_checks = get_required_checks(asf_yaml) + if not required_checks: + print("No required_status_checks found in .asf.yaml — nothing to validate.") + return + + jobs = get_workflow_jobs(workflows_dir) + errors = [] + + for ctx in sorted(required_checks): + branches = ", ".join(sorted(required_checks[ctx])) + if ctx not in jobs: + errors.append( + f' - "{ctx}" (branch: {branches}): ' + f"not found in any GitHub Actions workflow" + ) + continue + + # Check if ALL workflows providing this job have path filters + # (if at least one doesn't, the check will still run) + filtered_workflows = [ + wf for wf, has_filter in jobs[ctx] if has_filter + ] + unfiltered_workflows = [ + wf for wf, has_filter in jobs[ctx] if not has_filter + ] + if filtered_workflows and not unfiltered_workflows: + wf_list = ", ".join(filtered_workflows) + errors.append( + f' - "{ctx}" (branch: {branches}): ' + f"workflow {wf_list} uses paths/paths-ignore filters on " + f"pull_request, so this check won't run for some PRs " + f"and will block merging" + ) + + if errors: + print("ERROR: Problems found with required_status_checks in .asf.yaml:\n") + print("\n".join(errors)) + print() + print("Available job names across all workflows:") + for name in sorted(jobs): + print(f" - {name}") + sys.exit(1) + + print( + f"OK: All {len(required_checks)} required_status_checks " + "match existing GitHub Actions jobs." 
+ ) + + +if __name__ == "__main__": + main() diff --git a/ci/scripts/check_examples_docs.sh b/ci/scripts/check_examples_docs.sh new file mode 100755 index 0000000000000..62308b323b535 --- /dev/null +++ b/ci/scripts/check_examples_docs.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Generates documentation for DataFusion examples using the Rust-based +# documentation generator and verifies that the committed README.md +# is up to date. +# +# The README is generated from documentation comments in: +# datafusion-examples/examples//main.rs +# +# This script is intended to be run in CI to ensure that example +# documentation stays in sync with the code. +# +# To update the README locally, run this script and replace README.md +# with the generated output. + +set -euo pipefail + +ROOT_DIR="$(git rev-parse --show-toplevel)" + +# Load centralized tool versions +source "${ROOT_DIR}/ci/scripts/utils/tool_versions.sh" + +EXAMPLES_DIR="$ROOT_DIR/datafusion-examples" +README="$EXAMPLES_DIR/README.md" +README_NEW="$EXAMPLES_DIR/README-NEW.md" + +echo "▶ Generating examples README (Rust generator)…" +cargo run --quiet \ + --manifest-path "$EXAMPLES_DIR/Cargo.toml" \ + --bin examples-docs \ + > "$README_NEW" + +echo "▶ Formatting generated README with prettier ${PRETTIER_VERSION}…" +npx "prettier@${PRETTIER_VERSION}" \ + --parser markdown \ + --write "$README_NEW" + +echo "▶ Comparing generated README with committed version…" + +if ! diff -u "$README" "$README_NEW" > /tmp/examples-readme.diff; then + echo "" + echo "❌ Examples README is out of date." + echo "" + echo "The examples documentation is generated automatically from:" + echo " - datafusion-examples/examples//main.rs" + echo "" + echo "To update the README locally, run:" + echo "" + echo " cargo run --bin examples-docs \\" + echo " | npx prettier@${PRETTIER_VERSION} --parser markdown --write \\" + echo " > datafusion-examples/README.md" + echo "" + echo "Diff:" + echo "------------------------------------------------------------" + cat /tmp/examples-readme.diff + echo "------------------------------------------------------------" + exit 1 +fi + +echo "✅ Examples README is up-to-date." diff --git a/ci/scripts/doc_prettier_check.sh b/ci/scripts/doc_prettier_check.sh new file mode 100755 index 0000000000000..95332eb65aaf2 --- /dev/null +++ b/ci/scripts/doc_prettier_check.sh @@ -0,0 +1,86 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -euo pipefail + +ROOT_DIR="$(git rev-parse --show-toplevel)" +SCRIPT_NAME="$(basename "${BASH_SOURCE[0]}")" + +# Load shared utilities and tool versions +source "${ROOT_DIR}/ci/scripts/utils/tool_versions.sh" +source "${ROOT_DIR}/ci/scripts/utils/git.sh" + +PRETTIER_TARGETS=( + '{datafusion,datafusion-cli,datafusion-examples,dev,docs}/**/*.md' + '!datafusion/CHANGELOG.md' + README.md + CONTRIBUTING.md +) + +MODE="check" +ALLOW_DIRTY=0 + +usage() { + cat >&2 </dev/null 2>&1; then + echo "npx is required to run the prettier check. Install Node.js (e.g., brew install node) and re-run." >&2 + exit 1 +fi + +PRETTIER_MODE=(--check) +if [[ "$MODE" == "write" ]]; then + PRETTIER_MODE=(--write) +fi + +# Ignore subproject CHANGELOG.md because it is machine generated +npx "prettier@${PRETTIER_VERSION}" "${PRETTIER_MODE[@]}" "${PRETTIER_TARGETS[@]}" diff --git a/ci/scripts/license_header.sh b/ci/scripts/license_header.sh index 5345728f9cdf0..7ab8c9637598b 100755 --- a/ci/scripts/license_header.sh +++ b/ci/scripts/license_header.sh @@ -17,6 +17,62 @@ # specific language governing permissions and limitations # under the License. -# Check Apache license header -set -ex -hawkeye check --config licenserc.toml +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT_NAME="$(basename "${BASH_SOURCE[0]}")" + +source "${SCRIPT_DIR}/utils/git.sh" + +MODE="check" +ALLOW_DIRTY=0 +HAWKEYE_CONFIG="licenserc.toml" + +usage() { + cat >&2 <&2 <&2 <&2 <&2 <&2 + return 1 + fi +} diff --git a/ci/scripts/utils/tool_versions.sh b/ci/scripts/utils/tool_versions.sh new file mode 100644 index 0000000000000..237b18b62ef40 --- /dev/null +++ b/ci/scripts/utils/tool_versions.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This file defines centralized tool versions used by CI and development scripts. +# It is intended to be sourced by other scripts and should not be executed directly. 
+ +PRETTIER_VERSION="2.7.1" +LYCHEE_VERSION="0.23.0" diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index f3069b492352d..414b8c6444869 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -37,10 +37,10 @@ backtrace = ["datafusion/backtrace"] [dependencies] arrow = { workspace = true } async-trait = { workspace = true } -aws-config = "1.8.7" -aws-credential-types = "1.2.7" +aws-config = "1.8.16" +aws-credential-types = "1.2.13" chrono = { workspace = true } -clap = { version = "4.5.50", features = ["cargo", "derive"] } +clap = { version = "4.5.60", features = ["cargo", "derive"] } datafusion = { workspace = true, features = [ "avro", "compression", @@ -65,14 +65,24 @@ object_store = { workspace = true, features = ["aws", "gcp", "http"] } parking_lot = { workspace = true } parquet = { workspace = true, default-features = false } regex = { workspace = true } -rustyline = "17.0" +rustyline = "18.0" tokio = { workspace = true, features = ["macros", "parking_lot", "rt", "rt-multi-thread", "signal", "sync"] } url = { workspace = true } +[lints] +workspace = true + [dev-dependencies] ctor = { workspace = true } insta = { workspace = true } insta-cmd = "0.6.0" rstest = { workspace = true } -testcontainers = { workspace = true } testcontainers-modules = { workspace = true, features = ["minio"] } +# Makes sure `test_display_pg_json` behaves in a consistent way regardless of +# feature unification with dependencies +serde_json = { workspace = true, features = ["preserve_order"] } + +# Required because we pull serde_json with a feature to get consistent pg display, +# but it's not directly used. +[package.metadata.cargo-machete] +ignored = ["serde_json"] diff --git a/datafusion-cli/examples/cli-session-context.rs b/datafusion-cli/examples/cli-session-context.rs index bd2dbb736781f..6095072163870 100644 --- a/datafusion-cli/examples/cli-session-context.rs +++ b/datafusion-cli/examples/cli-session-context.rs @@ -23,7 +23,7 @@ use std::sync::Arc; use datafusion::{ dataframe::DataFrame, error::DataFusionError, - execution::{context::SessionState, TaskContext}, + execution::{TaskContext, context::SessionState}, logical_expr::{LogicalPlan, LogicalPlanBuilder}, prelude::SessionContext, }; diff --git a/datafusion-cli/src/catalog.rs b/datafusion-cli/src/catalog.rs index 20d62eabc3901..185dfb6b08006 100644 --- a/datafusion-cli/src/catalog.rs +++ b/datafusion-cli/src/catalog.rs @@ -15,16 +15,15 @@ // specific language governing permissions and limitations // under the License.
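The `preserve_order` note in the Cargo.toml hunk above matters because serde_json's default map is a BTreeMap that sorts keys alphabetically; enabling the feature switches to an insertion-ordered map, so JSON output stays stable no matter which dependency enables the feature first. A small sketch of the observable difference (values illustrative):

    use serde_json::json;

    fn main() {
        let v = json!({"plan": "Projection", "output": 1});
        // With `preserve_order`: {"plan":"Projection","output":1}
        // Without it, keys are sorted: {"output":1,"plan":"Projection"}
        println!("{v}");
    }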
-use std::any::Any; use std::sync::{Arc, Weak}; -use crate::object_storage::{get_object_store, AwsOptions, GcpOptions}; +use crate::object_storage::{AwsOptions, GcpOptions, get_object_store}; use datafusion::catalog::{CatalogProvider, CatalogProviderList, SchemaProvider}; use datafusion::common::plan_datafusion_err; -use datafusion::datasource::listing::ListingTableUrl; use datafusion::datasource::TableProvider; +use datafusion::datasource::listing::ListingTableUrl; use datafusion::error::Result; use datafusion::execution::context::SessionState; use datafusion::execution::session_state::SessionStateBuilder; @@ -50,10 +49,6 @@ impl DynamicObjectStoreCatalog { } impl CatalogProviderList for DynamicObjectStoreCatalog { - fn as_any(&self) -> &dyn Any { - self - } - fn register_catalog( &self, name: String, @@ -91,10 +86,6 @@ impl DynamicObjectStoreCatalogProvider { } impl CatalogProvider for DynamicObjectStoreCatalogProvider { - fn as_any(&self) -> &dyn Any { - self - } - fn schema_names(&self) -> Vec { self.inner.schema_names() } @@ -134,10 +125,6 @@ impl DynamicObjectStoreSchemaProvider { #[async_trait] impl SchemaProvider for DynamicObjectStoreSchemaProvider { - fn as_any(&self) -> &dyn Any { - self - } - fn table_names(&self) -> Vec { self.inner.table_names() } @@ -152,10 +139,10 @@ impl SchemaProvider for DynamicObjectStoreSchemaProvider { async fn table(&self, name: &str) -> Result>> { let inner_table = self.inner.table(name).await; - if inner_table.is_ok() { - if let Some(inner_table) = inner_table? { - return Ok(Some(inner_table)); - } + if inner_table.is_ok() + && let Some(inner_table) = inner_table? + { + return Ok(Some(inner_table)); } // if the inner schema provider didn't have a table by @@ -219,12 +206,12 @@ impl SchemaProvider for DynamicObjectStoreSchemaProvider { } pub fn substitute_tilde(cur: String) -> String { - if let Some(usr_dir_path) = home_dir() { - if let Some(usr_dir) = usr_dir_path.to_str() { - if cur.starts_with('~') && !usr_dir.is_empty() { - return cur.replacen('~', usr_dir, 1); - } - } + if let Some(usr_dir_path) = home_dir() + && let Some(usr_dir) = usr_dir_path.to_str() + && cur.starts_with('~') + && !usr_dir.is_empty() + { + return cur.replacen('~', usr_dir, 1); } cur } @@ -359,10 +346,12 @@ mod tests { } else { "/home/user" }; - env::set_var( - if cfg!(windows) { "USERPROFILE" } else { "HOME" }, - test_home_path, - ); + unsafe { + env::set_var( + if cfg!(windows) { "USERPROFILE" } else { "HOME" }, + test_home_path, + ); + } let input = "~/Code/datafusion/benchmarks/data/tpch_sf1/part/part-0.parquet"; let expected = PathBuf::from(test_home_path) .join("Code") @@ -376,12 +365,16 @@ mod tests { .to_string(); let actual = substitute_tilde(input.to_string()); assert_eq!(actual, expected); - match original_home { - Some(home_path) => env::set_var( - if cfg!(windows) { "USERPROFILE" } else { "HOME" }, - home_path.to_str().unwrap(), - ), - None => env::remove_var(if cfg!(windows) { "USERPROFILE" } else { "HOME" }), + unsafe { + match original_home { + Some(home_path) => env::set_var( + if cfg!(windows) { "USERPROFILE" } else { "HOME" }, + home_path.to_str().unwrap(), + ), + None => { + env::remove_var(if cfg!(windows) { "USERPROFILE" } else { "HOME" }) + } + } } } } diff --git a/datafusion-cli/src/cli_context.rs b/datafusion-cli/src/cli_context.rs index 516929ebacf19..a6320f03fe4de 100644 --- a/datafusion-cli/src/cli_context.rs +++ b/datafusion-cli/src/cli_context.rs @@ -20,7 +20,7 @@ use std::sync::Arc; use datafusion::{ dataframe::DataFrame, 
error::DataFusionError, - execution::{context::SessionState, TaskContext}, + execution::{TaskContext, context::SessionState}, logical_expr::LogicalPlan, prelude::SessionContext, }; diff --git a/datafusion-cli/src/command.rs b/datafusion-cli/src/command.rs index 3fbfe5680cfcd..8aaa8025d1c3a 100644 --- a/datafusion-cli/src/command.rs +++ b/datafusion-cli/src/command.rs @@ -19,7 +19,7 @@ use crate::cli_context::CliSessionContext; use crate::exec::{exec_and_print, exec_from_lines}; -use crate::functions::{display_all_functions, Function}; +use crate::functions::{Function, display_all_functions}; use crate::print_format::PrintFormat; use crate::print_options::PrintOptions; use clap::ValueEnum; diff --git a/datafusion-cli/src/exec.rs b/datafusion-cli/src/exec.rs index d079a88a6440e..09347d6d7dc2c 100644 --- a/datafusion-cli/src/exec.rs +++ b/datafusion-cli/src/exec.rs @@ -35,19 +35,19 @@ use datafusion::execution::memory_pool::MemoryConsumer; use datafusion::logical_expr::{DdlStatement, LogicalPlan}; use datafusion::physical_plan::execution_plan::EmissionType; use datafusion::physical_plan::spill::get_record_batch_memory_size; -use datafusion::physical_plan::{execute_stream, ExecutionPlanProperties}; +use datafusion::physical_plan::{ExecutionPlanProperties, execute_stream}; use datafusion::sql::parser::{DFParser, Statement}; use datafusion::sql::sqlparser; use datafusion::sql::sqlparser::dialect::dialect_from_str; use futures::StreamExt; use log::warn; use object_store::Error::Generic; -use rustyline::error::ReadlineError; use rustyline::Editor; +use rustyline::error::ReadlineError; use std::collections::HashMap; use std::fs::File; -use std::io::prelude::*; use std::io::BufReader; +use std::io::prelude::*; use tokio::signal; /// run and execute SQL statements and commands, against a context with the given print options @@ -153,7 +153,7 @@ pub async fn exec_from_repl( } } else { eprintln!( - "'\\{}' is not a valid command", + "'\\{}' is not a valid command, you can use '\\?' to see all commands", &line[1..] ); } @@ -168,7 +168,10 @@ pub async fn exec_from_repl( } } } else { - eprintln!("'\\{}' is not a valid command", &line[1..]); + eprintln!( + "'\\{}' is not a valid command, you can use '\\?' to see all commands", + &line[1..] + ); } } Ok(line) => { @@ -193,6 +196,7 @@ pub async fn exec_from_repl( } Err(ReadlineError::Interrupted) => { println!("^C"); + rl.helper().unwrap().reset_hint(); continue; } Err(ReadlineError::Eof) => { @@ -266,7 +270,7 @@ impl StatementExecutor { let options = task_ctx.session_config().options(); // Track memory usage for the query result if it's bounded - let mut reservation = + let reservation = MemoryConsumer::new("DataFusion-Cli").register(task_ctx.memory_pool()); if physical_plan.boundedness().is_unbounded() { @@ -297,7 +301,7 @@ impl StatementExecutor { let curr_num_rows = batch.num_rows(); // Stop collecting results if the number of rows exceeds the limit // results batch should include the last batch that exceeds the limit - if row_count < max_rows + curr_num_rows { + if row_count < max_rows.saturating_add(curr_num_rows) { // Try to grow the reservation to accommodate the batch in memory reservation.try_grow(get_record_batch_memory_size(&batch))?; results.push(batch); @@ -334,7 +338,9 @@ impl StatementExecutor { if matches!(err.as_ref(), Generic { store, source: _ } if "S3".eq_ignore_ascii_case(store)) && self.statement_for_retry.is_some() => { - warn!("S3 region is incorrect, auto-detecting the correct region (this may be slow). 
Consider updating your region configuration."); + warn!( + "S3 region is incorrect, auto-detecting the correct region (this may be slow). Consider updating your region configuration." + ); let plan = create_plan(ctx, self.statement_for_retry.take().unwrap(), true) .await?; @@ -516,6 +522,7 @@ mod tests { use datafusion::common::plan_err; use datafusion::prelude::SessionContext; + use datafusion_common::assert_contains; use url::Url; async fn create_external_table_test(location: &str, sql: &str) -> Result<()> { @@ -699,8 +706,7 @@ mod tests { #[tokio::test] async fn create_object_store_table_gcs() -> Result<()> { let service_account_path = "fake_service_account_path"; - let service_account_key = - "{\"private_key\": \"fake_private_key.pem\",\"client_email\":\"fake_client_email\", \"private_key_id\":\"id\"}"; + let service_account_key = "{\"private_key\": \"fake_private_key.pem\",\"client_email\":\"fake_client_email\", \"private_key_id\":\"id\"}"; let application_credentials_path = "fake_application_credentials_path"; let location = "gcs://bucket/path/file.parquet"; @@ -710,15 +716,16 @@ mod tests { let err = create_external_table_test(location, &sql) .await .unwrap_err(); - assert!(err.to_string().contains("os error 2")); + assert_contains!(err.to_string(), "os error 2"); // for service_account_key - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('gcp.service_account_key' '{service_account_key}') LOCATION '{location}'"); + let sql = format!( + "CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('gcp.service_account_key' '{service_account_key}') LOCATION '{location}'" + ); let err = create_external_table_test(location, &sql) .await - .unwrap_err() - .to_string(); - assert!(err.contains("No RSA key found in pem file"), "{err}"); + .unwrap_err(); + assert_contains!(err.to_string(), "Error reading pem file: no items found"); // for application_credentials_path let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET @@ -726,7 +733,7 @@ mod tests { let err = create_external_table_test(location, &sql) .await .unwrap_err(); - assert!(err.to_string().contains("os error 2")); + assert_contains!(err.to_string(), "os error 2"); Ok(()) } @@ -748,8 +755,9 @@ mod tests { let location = "path/to/file.cvs"; // Test with format options - let sql = - format!("CREATE EXTERNAL TABLE test STORED AS CSV LOCATION '{location}' OPTIONS('format.has_header' 'true')"); + let sql = format!( + "CREATE EXTERNAL TABLE test STORED AS CSV LOCATION '{location}' OPTIONS('format.has_header' 'true')" + ); create_external_table_test(location, &sql).await.unwrap(); Ok(()) diff --git a/datafusion-cli/src/functions.rs b/datafusion-cli/src/functions.rs index d23b12469e385..26f007cdd3193 100644 --- a/datafusion-cli/src/functions.rs +++ b/datafusion-cli/src/functions.rs @@ -17,19 +17,24 @@ //! 
Functions that are query-able and searchable via the `\h` command

+use datafusion_common::instant::Instant;
 use std::fmt;
 use std::fs::File;
 use std::str::FromStr;
 use std::sync::Arc;

-use arrow::array::{Int64Array, StringArray, TimestampMillisecondArray, UInt64Array};
-use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit};
+use arrow::array::{
+    DurationMillisecondArray, GenericListArray, Int64Array, StringArray, StructArray,
+    TimestampMillisecondArray, UInt64Array,
+};
+use arrow::buffer::{Buffer, OffsetBuffer, ScalarBuffer};
+use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef, TimeUnit};
 use arrow::record_batch::RecordBatch;
 use arrow::util::pretty::pretty_format_batches;
-use datafusion::catalog::{Session, TableFunctionImpl};
-use datafusion::common::{plan_err, Column};
-use datafusion::datasource::memory::MemorySourceConfig;
+use datafusion::catalog::{Session, TableFunctionArgs, TableFunctionImpl};
+use datafusion::common::{Column, plan_err};
 use datafusion::datasource::TableProvider;
+use datafusion::datasource::memory::MemorySourceConfig;
 use datafusion::error::Result;
 use datafusion::execution::cache::cache_manager::CacheManager;
 use datafusion::logical_expr::Expr;
@@ -224,11 +229,7 @@ struct ParquetMetadataTable {

 #[async_trait]
 impl TableProvider for ParquetMetadataTable {
-    fn as_any(&self) -> &dyn std::any::Any {
-        self
-    }
-
-    fn schema(&self) -> arrow::datatypes::SchemaRef {
+    fn schema(&self) -> SchemaRef {
         self.schema.clone()
     }

@@ -321,7 +322,8 @@ fn fixed_len_byte_array_to_string(val: Option<&FixedLenByteArray>) -> Option<String> {
-    fn call(&self, exprs: &[Expr]) -> Result<Arc<dyn TableProvider>> {
+    fn call_with_args(&self, args: TableFunctionArgs) -> Result<Arc<dyn TableProvider>> {
+        let exprs = args.exprs();
         let filename = match exprs.first() {
             Some(Expr::Literal(ScalarValue::Utf8(Some(s)), _)) => s, // single quote: parquet_metadata('x.parquet')
             Some(Expr::Column(Column { name, .. })) => name, // double quote: parquet_metadata("x.parquet")
@@ -421,7 +423,7 @@ impl TableFunctionImpl for ParquetMetadataFunc {
             compression_arr.push(format!("{:?}", column.compression()));
             // need to collect into Vec to format
             let encodings: Vec<_> = column.encodings().collect();
-            encodings_arr.push(format!("{:?}", encodings));
+            encodings_arr.push(format!("{encodings:?}"));
             index_page_offset_arr.push(column.index_page_offset());
             dictionary_page_offset_arr.push(column.dictionary_page_offset());
             data_page_offset_arr.push(column.data_page_offset());
@@ -473,11 +475,7 @@ struct MetadataCacheTable {

 #[async_trait]
 impl TableProvider for MetadataCacheTable {
-    fn as_any(&self) -> &dyn std::any::Any {
-        self
-    }
-
-    fn schema(&self) -> arrow::datatypes::SchemaRef {
+    fn schema(&self) -> SchemaRef {
         self.schema.clone()
     }

@@ -512,7 +510,8 @@ impl MetadataCacheFunc {
 }

 impl TableFunctionImpl for MetadataCacheFunc {
-    fn call(&self, exprs: &[Expr]) -> Result<Arc<dyn TableProvider>> {
+    fn call_with_args(&self, args: TableFunctionArgs) -> Result<Arc<dyn TableProvider>> {
+        let exprs = args.exprs();
         if !exprs.is_empty() {
             return plan_err!("metadata_cache should have no arguments");
         }
@@ -581,3 +580,292 @@ impl TableFunctionImpl for MetadataCacheFunc {
         Ok(Arc::new(metadata_cache))
     }
 }
+
+/// STATISTICS_CACHE table function
+#[derive(Debug)]
+struct StatisticsCacheTable {
+    schema: SchemaRef,
+    batch: RecordBatch,
+}
+
+#[async_trait]
+impl TableProvider for StatisticsCacheTable {
+    fn schema(&self) -> SchemaRef {
+        self.schema.clone()
+    }
+
+    fn table_type(&self) -> datafusion::logical_expr::TableType {
+        datafusion::logical_expr::TableType::Base
+    }
+
+    async fn scan(
+        &self,
+        _state: &dyn Session,
+        projection: Option<&Vec<usize>>,
+        _filters: &[Expr],
+        _limit: Option<usize>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        Ok(MemorySourceConfig::try_new_exec(
+            &[vec![self.batch.clone()]],
+            TableProvider::schema(self),
+            projection.cloned(),
+        )?)
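+        // MemorySourceConfig::try_new_exec exposes the single pre-built
+        // RecordBatch as a one-partition, in-memory ExecutionPlan; the
+        // planner-supplied projection is applied by the source itself, while
+        // `_filters` and `_limit` are deliberately ignored for this tiny table.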
+ } +} + +#[derive(Debug)] +pub struct StatisticsCacheFunc { + cache_manager: Arc, +} + +impl StatisticsCacheFunc { + pub fn new(cache_manager: Arc) -> Self { + Self { cache_manager } + } +} + +impl TableFunctionImpl for StatisticsCacheFunc { + fn call_with_args(&self, args: TableFunctionArgs) -> Result> { + let exprs = args.exprs(); + if !exprs.is_empty() { + return plan_err!("statistics_cache should have no arguments"); + } + + let schema = Arc::new(Schema::new(vec![ + Field::new("path", DataType::Utf8, false), + Field::new( + "file_modified", + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + Field::new("file_size_bytes", DataType::UInt64, false), + Field::new("e_tag", DataType::Utf8, true), + Field::new("version", DataType::Utf8, true), + Field::new("num_rows", DataType::Utf8, false), + Field::new("num_columns", DataType::UInt64, false), + Field::new("table_size_bytes", DataType::Utf8, false), + Field::new("statistics_size_bytes", DataType::UInt64, false), + ])); + + // construct record batch from metadata + let mut path_arr = vec![]; + let mut file_modified_arr = vec![]; + let mut file_size_bytes_arr = vec![]; + let mut e_tag_arr = vec![]; + let mut version_arr = vec![]; + let mut num_rows_arr = vec![]; + let mut num_columns_arr = vec![]; + let mut table_size_bytes_arr = vec![]; + let mut statistics_size_bytes_arr = vec![]; + + if let Some(file_statistics_cache) = self.cache_manager.get_file_statistic_cache() + { + for (path, entry) in file_statistics_cache.list_entries() { + path_arr.push(path.to_string()); + file_modified_arr + .push(Some(entry.object_meta.last_modified.timestamp_millis())); + file_size_bytes_arr.push(entry.object_meta.size); + e_tag_arr.push(entry.object_meta.e_tag); + version_arr.push(entry.object_meta.version); + num_rows_arr.push(entry.num_rows.to_string()); + num_columns_arr.push(entry.num_columns as u64); + table_size_bytes_arr.push(entry.table_size_bytes.to_string()); + statistics_size_bytes_arr.push(entry.statistics_size_bytes as u64); + } + } + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(StringArray::from(path_arr)), + Arc::new(TimestampMillisecondArray::from(file_modified_arr)), + Arc::new(UInt64Array::from(file_size_bytes_arr)), + Arc::new(StringArray::from(e_tag_arr)), + Arc::new(StringArray::from(version_arr)), + Arc::new(StringArray::from(num_rows_arr)), + Arc::new(UInt64Array::from(num_columns_arr)), + Arc::new(StringArray::from(table_size_bytes_arr)), + Arc::new(UInt64Array::from(statistics_size_bytes_arr)), + ], + )?; + + let statistics_cache = StatisticsCacheTable { schema, batch }; + Ok(Arc::new(statistics_cache)) + } +} + +/// Implementation of the `list_files_cache` table function in datafusion-cli. +/// +/// This function returns the cached results of running a LIST command on a +/// particular object store path for a table. The object metadata is returned as +/// a List of Structs, with one Struct for each object. DataFusion uses these +/// cached results to plan queries against external tables. 
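For orientation, here is a hedged sketch of how this table function is wired up and queried; it mirrors the `register_udtf` call added to `main.rs` later in this patch (the helper name `inspect_list_files_cache` is illustrative, not part of the patch):

```rust
use std::sync::Arc;

use datafusion::error::Result;
use datafusion::prelude::SessionContext;
use datafusion_cli::functions::ListFilesCacheFunc;

// Register the UDTF against the session's cache manager, then inspect the
// cache through plain SQL; column names follow the schema documented below.
async fn inspect_list_files_cache(ctx: &SessionContext) -> Result<()> {
    ctx.register_udtf(
        "list_files_cache",
        Arc::new(ListFilesCacheFunc::new(
            ctx.task_ctx().runtime_env().cache_manager.clone(),
        )),
    );
    ctx.sql("SELECT path, metadata_size_bytes, expires_in FROM list_files_cache()")
        .await?
        .show()
        .await
}
```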
+/// +/// # Schema +/// ```sql +/// > describe select * from list_files_cache(); +/// +---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+ +/// | column_name | data_type | is_nullable | +/// +---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+ +/// | table | Utf8 | NO | +/// | path | Utf8 | NO | +/// | metadata_size_bytes | UInt64 | NO | +/// | expires_in | Duration(ms) | YES | +/// | metadata_list | List(Struct("file_path": non-null Utf8, "file_modified": non-null Timestamp(ms), "file_size_bytes": non-null UInt64, "e_tag": Utf8, "version": Utf8), field: 'metadata') | YES | +/// +---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+ +/// ``` +#[derive(Debug)] +struct ListFilesCacheTable { + schema: SchemaRef, + batch: RecordBatch, +} + +#[async_trait] +impl TableProvider for ListFilesCacheTable { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn table_type(&self) -> datafusion::logical_expr::TableType { + datafusion::logical_expr::TableType::Base + } + + async fn scan( + &self, + _state: &dyn Session, + projection: Option<&Vec>, + _filters: &[Expr], + _limit: Option, + ) -> Result> { + Ok(MemorySourceConfig::try_new_exec( + &[vec![self.batch.clone()]], + TableProvider::schema(self), + projection.cloned(), + )?) + } +} + +#[derive(Debug)] +pub struct ListFilesCacheFunc { + cache_manager: Arc, +} + +impl ListFilesCacheFunc { + pub fn new(cache_manager: Arc) -> Self { + Self { cache_manager } + } +} + +impl TableFunctionImpl for ListFilesCacheFunc { + fn call_with_args(&self, args: TableFunctionArgs) -> Result> { + let exprs = args.exprs(); + if !exprs.is_empty() { + return plan_err!("list_files_cache should have no arguments"); + } + + let nested_fields = Fields::from(vec![ + Field::new("file_path", DataType::Utf8, false), + Field::new( + "file_modified", + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + Field::new("file_size_bytes", DataType::UInt64, false), + Field::new("e_tag", DataType::Utf8, true), + Field::new("version", DataType::Utf8, true), + ]); + + let metadata_field = + Field::new("metadata", DataType::Struct(nested_fields.clone()), true); + + let schema = Arc::new(Schema::new(vec![ + Field::new("table", DataType::Utf8, true), + Field::new("path", DataType::Utf8, false), + Field::new("metadata_size_bytes", DataType::UInt64, false), + // expires field in ListFilesEntry has type Instant when set, from which we cannot get "the number of seconds", hence using Duration instead of Timestamp as data type. 
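+            // For example, with `expires: Option<Instant>`, the remaining TTL
+            // is computed as `t.duration_since(now)` in the loop below and
+            // stored as elapsed milliseconds.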
+ Field::new( + "expires_in", + DataType::Duration(TimeUnit::Millisecond), + true, + ), + Field::new( + "metadata_list", + DataType::List(Arc::new(metadata_field.clone())), + true, + ), + ])); + + let mut table_arr = vec![]; + let mut path_arr = vec![]; + let mut metadata_size_bytes_arr = vec![]; + let mut expires_arr = vec![]; + + let mut file_path_arr = vec![]; + let mut file_modified_arr = vec![]; + let mut file_size_bytes_arr = vec![]; + let mut etag_arr = vec![]; + let mut version_arr = vec![]; + let mut offsets: Vec = vec![0]; + + if let Some(list_files_cache) = self.cache_manager.get_list_files_cache() { + let now = Instant::now(); + let mut current_offset: i32 = 0; + + for (path, entry) in list_files_cache.list_entries() { + table_arr.push(path.table.map(|t| t.to_string())); + path_arr.push(path.path.to_string()); + metadata_size_bytes_arr.push(entry.size_bytes as u64); + // calculates time left before entry expires + expires_arr.push( + entry + .expires + .map(|t| t.duration_since(now).as_millis() as i64), + ); + + for meta in entry.metas.files.iter() { + file_path_arr.push(meta.location.to_string()); + file_modified_arr.push(meta.last_modified.timestamp_millis()); + file_size_bytes_arr.push(meta.size); + etag_arr.push(meta.e_tag.clone()); + version_arr.push(meta.version.clone()); + } + current_offset += entry.metas.files.len() as i32; + offsets.push(current_offset); + } + } + + let struct_arr = StructArray::new( + nested_fields, + vec![ + Arc::new(StringArray::from(file_path_arr)), + Arc::new(TimestampMillisecondArray::from(file_modified_arr)), + Arc::new(UInt64Array::from(file_size_bytes_arr)), + Arc::new(StringArray::from(etag_arr)), + Arc::new(StringArray::from(version_arr)), + ], + None, + ); + + let offsets_buffer: OffsetBuffer = + OffsetBuffer::new(ScalarBuffer::from(Buffer::from_vec(offsets))); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(StringArray::from(table_arr)), + Arc::new(StringArray::from(path_arr)), + Arc::new(UInt64Array::from(metadata_size_bytes_arr)), + Arc::new(DurationMillisecondArray::from(expires_arr)), + Arc::new(GenericListArray::new( + Arc::new(metadata_field), + offsets_buffer, + Arc::new(struct_arr), + None, + )), + ], + )?; + + let list_files_cache = ListFilesCacheTable { schema, batch }; + Ok(Arc::new(list_files_cache)) + } +} diff --git a/datafusion-cli/src/helper.rs b/datafusion-cli/src/helper.rs index 219637b3460e6..67e203cf7987b 100644 --- a/datafusion-cli/src/helper.rs +++ b/datafusion-cli/src/helper.rs @@ -19,8 +19,9 @@ //! and auto-completion for file name during creating external table. use std::borrow::Cow; +use std::cell::Cell; -use crate::highlighter::{NoSyntaxHighlighter, SyntaxHighlighter}; +use crate::highlighter::{Color, NoSyntaxHighlighter, SyntaxHighlighter}; use datafusion::sql::parser::{DFParser, Statement}; use datafusion::sql::sqlparser::dialect::dialect_from_str; @@ -33,10 +34,17 @@ use rustyline::hint::Hinter; use rustyline::validate::{ValidationContext, ValidationResult, Validator}; use rustyline::{Context, Helper, Result}; +/// Default suggestion shown when the input line is empty. +const DEFAULT_HINT_SUGGESTION: &str = " \\? for help, \\q to quit"; + pub struct CliHelper { completer: FilenameCompleter, dialect: Dialect, highlighter: Box, + /// Tracks whether to show the default hint. Set to `false` once the user + /// types anything, so the hint doesn't reappear after deleting back to + /// an empty line. Reset to `true` when the line is submitted. 
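The `Cell<bool>` on the field that follows is needed because rustyline's `Hinter::hint` only receives `&self`; a standalone sketch of that interior-mutability pattern (hypothetical `HintState` type, not part of the patch):

```rust
use std::cell::Cell;

struct HintState {
    show: Cell<bool>,
}

impl HintState {
    // Takes `&self`, as trait methods like `Hinter::hint` require, yet the
    // flag can still be flipped: `Cell` moves values in and out instead of
    // handing out a `&mut` reference.
    fn on_user_typed(&self) {
        self.show.set(false);
    }
}

fn main() {
    let state = HintState { show: Cell::new(true) };
    state.on_user_typed();
    assert!(!state.show.get());
}
```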
+    show_hint: Cell<bool>,
 }

 impl CliHelper {
@@ -50,6 +58,7 @@ impl CliHelper {
             completer: FilenameCompleter::new(),
             dialect: *dialect,
             highlighter,
+            show_hint: Cell::new(true),
         }
     }

@@ -59,6 +68,11 @@ impl CliHelper {
         }
     }

+    /// Re-enable the default hint for the next prompt.
+    pub fn reset_hint(&self) {
+        self.show_hint.set(true);
+    }
+
     fn validate_input(&self, input: &str) -> Result<ValidationResult> {
         if let Some(sql) = input.strip_suffix(';') {
             let dialect = match dialect_from_str(self.dialect) {
@@ -67,7 +81,7 @@
                     return Ok(ValidationResult::Invalid(Some(format!(
                         " 🤔 Invalid dialect: {}",
                         self.dialect
-                    ))))
+                    ))));
                 }
             };
             let lines = split_from_semicolon(sql);
@@ -110,10 +124,22 @@ impl Highlighter for CliHelper {
     fn highlight_char(&self, line: &str, pos: usize, kind: CmdKind) -> bool {
         self.highlighter.highlight_char(line, pos, kind)
     }
+
+    fn highlight_hint<'h>(&self, hint: &'h str) -> Cow<'h, str> {
+        Color::gray(hint).into()
+    }
 }

 impl Hinter for CliHelper {
     type Hint = String;
+
+    fn hint(&self, line: &str, _pos: usize, _ctx: &Context<'_>) -> Option<String> {
+        if !line.is_empty() {
+            self.show_hint.set(false);
+        }
+        (self.show_hint.get() && line.trim().is_empty())
+            .then(|| DEFAULT_HINT_SUGGESTION.to_owned())
+    }
 }

 /// returns true if the current position is after the open quote for
@@ -121,12 +147,9 @@
 fn is_open_quote_for_location(line: &str, pos: usize) -> bool {
     let mut sql = line[..pos].to_string();
     sql.push('\'');
-    if let Ok(stmts) = DFParser::parse_sql(&sql) {
-        if let Some(Statement::CreateExternalTable(_)) = stmts.back() {
-            return true;
-        }
-    }
-    false
+    DFParser::parse_sql(&sql).is_ok_and(|stmts| {
+        matches!(stmts.back(), Some(Statement::CreateExternalTable(_)))
+    })
 }

 impl Completer for CliHelper {
@@ -149,7 +172,9 @@
 impl Validator for CliHelper {
     fn validate(&self, ctx: &mut ValidationContext<'_>) -> Result<ValidationResult> {
         let input = ctx.input().trim_end();
-        self.validate_input(input)
+        let result = self.validate_input(input);
+        self.reset_hint();
+        result
     }
 }
 diff --git a/datafusion-cli/src/highlighter.rs b/datafusion-cli/src/highlighter.rs
index f4e57a2e3593a..adcb135bb401f 100644
--- a/datafusion-cli/src/highlighter.rs
+++ b/datafusion-cli/src/highlighter.rs
@@ -23,7 +23,7 @@ use std::{
 };

 use datafusion::sql::sqlparser::{
-    dialect::{dialect_from_str, Dialect, GenericDialect},
+    dialect::{Dialect, GenericDialect, dialect_from_str},
     keywords::Keyword,
     tokenizer::{Token, Tokenizer},
 };
@@ -38,7 +38,8 @@ pub struct SyntaxHighlighter {

 impl SyntaxHighlighter {
     pub fn new(dialect: &config::Dialect) -> Self {
-        let dialect = dialect_from_str(dialect).unwrap_or(Box::new(GenericDialect {}));
+        let dialect =
+            dialect_from_str(dialect).unwrap_or_else(|| Box::new(GenericDialect {}));
         Self { dialect }
     }
 }
@@ -80,22 +81,26 @@ impl Highlighter for SyntaxHighlighter {
 }

 /// Convenient utility to return strings with [ANSI color](https://gist.github.com/JBlond/2fea43a3049b38287e5e9cefc87b2124).
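The helpers on `Color` below all follow the standard ANSI SGR scheme: `\x1b[<n>m` starts a style and `\x1b[0m` resets it, where codes 90, 91, and 92 select bright black (gray), bright red, and bright green. A one-line illustration:

```rust
fn main() {
    // 92 = bright green keyword, 90 = gray hint text, 0 resets the style.
    println!("\x1b[92mSELECT\x1b[0m \x1b[90m-- gray hint text\x1b[0m");
}
```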
-struct Color {} +pub(crate) struct Color {} impl Color { - fn green(s: impl Display) -> String { + pub(crate) fn green(s: impl Display) -> String { format!("\x1b[92m{s}\x1b[0m") } - fn red(s: impl Display) -> String { + pub(crate) fn red(s: impl Display) -> String { format!("\x1b[91m{s}\x1b[0m") } + + pub(crate) fn gray(s: impl Display) -> String { + format!("\x1b[90m{s}\x1b[0m") + } } #[cfg(test)] mod tests { - use super::config::Dialect; use super::SyntaxHighlighter; + use super::config::Dialect; use rustyline::highlight::Highlighter; #[test] diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index 09fa8ef15af84..6bfe1160ecdd6 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -31,16 +31,17 @@ use datafusion::execution::runtime_env::RuntimeEnvBuilder; use datafusion::logical_expr::ExplainFormat; use datafusion::prelude::SessionContext; use datafusion_cli::catalog::DynamicObjectStoreCatalog; -use datafusion_cli::functions::{MetadataCacheFunc, ParquetMetadataFunc}; +use datafusion_cli::functions::{ + ListFilesCacheFunc, MetadataCacheFunc, ParquetMetadataFunc, StatisticsCacheFunc, +}; use datafusion_cli::object_storage::instrumented::{ InstrumentedObjectStoreMode, InstrumentedObjectStoreRegistry, }; use datafusion_cli::{ - exec, + DATAFUSION_CLI_VERSION, exec, pool_type::PoolType, print_format::PrintFormat, print_options::{MaxRows, PrintOptions}, - DATAFUSION_CLI_VERSION, }; use clap::Parser; @@ -244,6 +245,21 @@ async fn main_inner() -> Result<()> { )), ); + // register `statistics_cache` table function to get the contents of the file statistics cache + ctx.register_udtf( + "statistics_cache", + Arc::new(StatisticsCacheFunc::new( + ctx.task_ctx().runtime_env().cache_manager.clone(), + )), + ); + + ctx.register_udtf( + "list_files_cache", + Arc::new(ListFilesCacheFunc::new( + ctx.task_ctx().runtime_env().cache_manager.clone(), + )), + ); + let mut print_options = PrintOptions { format: args.format, quiet: args.quiet, @@ -422,9 +438,20 @@ pub fn extract_disk_limit(size: &str) -> Result { #[cfg(test)] mod tests { + use std::time::Duration; + use super::*; - use datafusion::{common::test_util::batches_to_string, prelude::ParquetReadOptions}; + use datafusion::{ + common::test_util::batches_to_string, + execution::cache::{ + DefaultListFilesCache, cache_manager::CacheManagerConfig, + cache_unit::DefaultFileStatisticsCache, + }, + prelude::{ParquetReadOptions, col, lit, split_part}, + }; use insta::assert_snapshot; + use object_store::memory::InMemory; + use url::Url; fn assert_conversion(input: &str, expected: Result) { let result = extract_memory_pool_size(input); @@ -488,8 +515,7 @@ mod tests { ctx.register_udtf("parquet_metadata", Arc::new(ParquetMetadataFunc {})); // input with single quote - let sql = - "SELECT * FROM parquet_metadata('../datafusion/core/tests/data/fixed_size_list_array.parquet')"; + let sql = "SELECT * FROM parquet_metadata('../datafusion/core/tests/data/fixed_size_list_array.parquet')"; let df = ctx.sql(sql).await?; let rbs = df.collect().await?; @@ -502,8 +528,7 @@ mod tests { "#); // input with double quote - let sql = - "SELECT * FROM parquet_metadata(\"../datafusion/core/tests/data/fixed_size_list_array.parquet\")"; + let sql = "SELECT * FROM parquet_metadata(\"../datafusion/core/tests/data/fixed_size_list_array.parquet\")"; let df = ctx.sql(sql).await?; let rbs = df.collect().await?; assert_snapshot!(batches_to_string(&rbs), @r#" @@ -523,8 +548,7 @@ mod tests { ctx.register_udtf("parquet_metadata", 
Arc::new(ParquetMetadataFunc {})); // input with string columns - let sql = - "SELECT * FROM parquet_metadata('../parquet-testing/data/data_index_bloom_encoding_stats.parquet')"; + let sql = "SELECT * FROM parquet_metadata('../parquet-testing/data/data_index_bloom_encoding_stats.parquet')"; let df = ctx.sql(sql).await?; let rbs = df.collect().await?; @@ -592,9 +616,9 @@ mod tests { +-----------------------------------+-----------------+---------------------+------+------------------+ | filename | file_size_bytes | metadata_size_bytes | hits | extra | +-----------------------------------+-----------------+---------------------+------+------------------+ - | alltypes_plain.parquet | 1851 | 6957 | 2 | page_index=false | - | alltypes_tiny_pages.parquet | 454233 | 267014 | 2 | page_index=true | - | lz4_raw_compressed_larger.parquet | 380836 | 996 | 2 | page_index=false | + | alltypes_plain.parquet | 1851 | 8882 | 2 | page_index=false | + | alltypes_tiny_pages.parquet | 454233 | 269074 | 2 | page_index=true | + | lz4_raw_compressed_larger.parquet | 380836 | 1339 | 2 | page_index=false | +-----------------------------------+-----------------+---------------------+------+------------------+ "); @@ -623,12 +647,205 @@ mod tests { +-----------------------------------+-----------------+---------------------+------+------------------+ | filename | file_size_bytes | metadata_size_bytes | hits | extra | +-----------------------------------+-----------------+---------------------+------+------------------+ - | alltypes_plain.parquet | 1851 | 6957 | 5 | page_index=false | - | alltypes_tiny_pages.parquet | 454233 | 267014 | 2 | page_index=true | - | lz4_raw_compressed_larger.parquet | 380836 | 996 | 3 | page_index=false | + | alltypes_plain.parquet | 1851 | 8882 | 5 | page_index=false | + | alltypes_tiny_pages.parquet | 454233 | 269074 | 2 | page_index=true | + | lz4_raw_compressed_larger.parquet | 380836 | 1339 | 3 | page_index=false | +-----------------------------------+-----------------+---------------------+------+------------------+ "); Ok(()) } + + /// Shows that the statistics cache is not enabled by default yet + /// See https://github.com/apache/datafusion/issues/19217 + #[tokio::test] + async fn test_statistics_cache_default() -> Result<(), DataFusionError> { + let ctx = SessionContext::new(); + + ctx.register_udtf( + "statistics_cache", + Arc::new(StatisticsCacheFunc::new( + ctx.task_ctx().runtime_env().cache_manager.clone(), + )), + ); + + for filename in [ + "alltypes_plain", + "alltypes_tiny_pages", + "lz4_raw_compressed_larger", + ] { + ctx.sql( + format!( + "create external table {filename} + stored as parquet + location '../parquet-testing/data/{filename}.parquet'", + ) + .as_str(), + ) + .await? 
+ .collect() + .await?; + } + + // When the cache manager creates a StatisticsCache by default, + // the contents will show up here + let sql = "SELECT split_part(path, '/', -1) as filename, file_size_bytes, num_rows, num_columns, table_size_bytes from statistics_cache() order by filename"; + let df = ctx.sql(sql).await?; + let rbs = df.collect().await?; + assert_snapshot!(batches_to_string(&rbs),@r" + ++ + ++ + "); + + Ok(()) + } + + // Can be removed when https://github.com/apache/datafusion/issues/19217 is resolved + #[tokio::test] + async fn test_statistics_cache_override() -> Result<(), DataFusionError> { + // Install a specific StatisticsCache implementation + let file_statistics_cache = Arc::new(DefaultFileStatisticsCache::default()); + let cache_config = CacheManagerConfig::default() + .with_files_statistics_cache(Some(file_statistics_cache.clone())); + let runtime = RuntimeEnvBuilder::new() + .with_cache_manager(cache_config) + .build()?; + let config = SessionConfig::new().with_collect_statistics(true); + let ctx = SessionContext::new_with_config_rt(config, Arc::new(runtime)); + + ctx.register_udtf( + "statistics_cache", + Arc::new(StatisticsCacheFunc::new( + ctx.task_ctx().runtime_env().cache_manager.clone(), + )), + ); + + for filename in [ + "alltypes_plain", + "alltypes_tiny_pages", + "lz4_raw_compressed_larger", + ] { + ctx.sql( + format!( + "create external table {filename} + stored as parquet + location '../parquet-testing/data/{filename}.parquet'", + ) + .as_str(), + ) + .await? + .collect() + .await?; + } + + let sql = "SELECT split_part(path, '/', -1) as filename, file_size_bytes, num_rows, num_columns, table_size_bytes from statistics_cache() order by filename"; + let df = ctx.sql(sql).await?; + let rbs = df.collect().await?; + assert_snapshot!(batches_to_string(&rbs),@r" + +-----------------------------------+-----------------+--------------+-------------+------------------+ + | filename | file_size_bytes | num_rows | num_columns | table_size_bytes | + +-----------------------------------+-----------------+--------------+-------------+------------------+ + | alltypes_plain.parquet | 1851 | Exact(8) | 11 | Absent | + | alltypes_tiny_pages.parquet | 454233 | Exact(7300) | 13 | Absent | + | lz4_raw_compressed_larger.parquet | 380836 | Exact(10000) | 1 | Absent | + +-----------------------------------+-----------------+--------------+-------------+------------------+ + "); + + Ok(()) + } + + #[tokio::test] + async fn test_list_files_cache() -> Result<(), DataFusionError> { + let list_files_cache = Arc::new(DefaultListFilesCache::new( + 1024, + Some(Duration::from_secs(1)), + )); + + let rt = RuntimeEnvBuilder::new() + .with_cache_manager( + CacheManagerConfig::default() + .with_list_files_cache(Some(list_files_cache)), + ) + .build_arc() + .unwrap(); + + let ctx = SessionContext::new_with_config_rt(SessionConfig::default(), rt); + + ctx.register_object_store( + &Url::parse("mem://test_table").unwrap(), + Arc::new(InMemory::new()), + ); + + ctx.register_udtf( + "list_files_cache", + Arc::new(ListFilesCacheFunc::new( + ctx.task_ctx().runtime_env().cache_manager.clone(), + )), + ); + + ctx.sql( + "CREATE EXTERNAL TABLE src_table + STORED AS PARQUET + LOCATION '../parquet-testing/data/alltypes_plain.parquet'", + ) + .await? 
+ .collect() + .await?; + + ctx.sql("COPY (SELECT * FROM src_table) TO 'mem://test_table/0.parquet' STORED AS PARQUET").await?.collect().await?; + + ctx.sql("COPY (SELECT * FROM src_table) TO 'mem://test_table/1.parquet' STORED AS PARQUET").await?.collect().await?; + + ctx.sql( + "CREATE EXTERNAL TABLE test_table + STORED AS PARQUET + LOCATION 'mem://test_table/' + ", + ) + .await? + .collect() + .await?; + + let sql = "SELECT metadata_size_bytes, expires_in, metadata_list FROM list_files_cache()"; + let df = ctx + .sql(sql) + .await? + .unnest_columns(&["metadata_list"])? + .with_column_renamed("metadata_list", "metadata")? + .unnest_columns(&["metadata"])?; + + assert_eq!( + 2, + df.clone() + .filter(col("expires_in").is_not_null())? + .count() + .await? + ); + + let df = df + .with_column_renamed(r#""metadata.file_size_bytes""#, "file_size_bytes")? + .with_column_renamed(r#""metadata.e_tag""#, "etag")? + .with_column( + "filename", + split_part(col(r#""metadata.file_path""#), lit("/"), lit(-1)), + )? + .select_columns(&[ + "metadata_size_bytes", + "filename", + "file_size_bytes", + "etag", + ])? + .sort(vec![col("filename").sort(true, false)])?; + let rbs = df.collect().await?; + assert_snapshot!(batches_to_string(&rbs),@r" + +---------------------+-----------+-----------------+------+ + | metadata_size_bytes | filename | file_size_bytes | etag | + +---------------------+-----------+-----------------+------+ + | 212 | 0.parquet | 3642 | 0 | + | 212 | 1.parquet | 3642 | 1 | + +---------------------+-----------+-----------------+------+ + "); + + Ok(()) + } } diff --git a/datafusion-cli/src/object_storage.rs b/datafusion-cli/src/object_storage.rs index e6e6be42c7ad0..34787838929f1 100644 --- a/datafusion-cli/src/object_storage.rs +++ b/datafusion-cli/src/object_storage.rs @@ -20,7 +20,7 @@ pub mod instrumented; use async_trait::async_trait; use aws_config::BehaviorVersion; use aws_credential_types::provider::{ - error::CredentialsError, ProvideCredentials, SharedCredentialsProvider, + ProvideCredentials, SharedCredentialsProvider, error::CredentialsError, }; use datafusion::{ common::{ @@ -33,12 +33,12 @@ use datafusion::{ }; use log::debug; use object_store::{ - aws::{AmazonS3Builder, AmazonS3ConfigKey, AwsCredential}, - gcp::GoogleCloudStorageBuilder, - http::HttpBuilder, ClientOptions, CredentialProvider, Error::Generic, ObjectStore, + aws::{AmazonS3Builder, AmazonS3ConfigKey, AwsCredential}, + gcp::GoogleCloudStorageBuilder, + http::HttpBuilder, }; use std::{ any::Any, @@ -64,6 +64,21 @@ pub async fn get_s3_object_store_builder( url: &Url, aws_options: &AwsOptions, resolve_region: bool, +) -> Result { + // Box the inner future to reduce the future size of this async function, + // which is deeply nested in the CLI's async call chain. + Box::pin(get_s3_object_store_builder_inner( + url, + aws_options, + resolve_region, + )) + .await +} + +async fn get_s3_object_store_builder_inner( + url: &Url, + aws_options: &AwsOptions, + resolve_region: bool, ) -> Result { let AwsOptions { access_key_id, @@ -124,14 +139,15 @@ pub async fn get_s3_object_store_builder( if let Some(endpoint) = endpoint { // Make a nicer error if the user hasn't allowed http and the endpoint // is http as the default message is "URL scheme is not allowed" - if let Ok(endpoint_url) = Url::try_from(endpoint.as_str()) { - if !matches!(allow_http, Some(true)) && endpoint_url.scheme() == "http" { - return config_err!( - "Invalid endpoint: {endpoint}. 
\ + if let Ok(endpoint_url) = Url::try_from(endpoint.as_str()) + && !matches!(allow_http, Some(true)) + && endpoint_url.scheme() == "http" + { + return config_err!( + "Invalid endpoint: {endpoint}. \ HTTP is not allowed for S3 endpoints. \ To allow HTTP, set 'aws.allow_http' to true" - ); - } + ); } builder = builder.with_endpoint(endpoint); @@ -208,7 +224,7 @@ impl CredentialsFromConfig { #[derive(Debug)] struct S3CredentialProvider { - credentials: aws_credential_types::provider::SharedCredentialsProvider, + credentials: SharedCredentialsProvider, } #[async_trait] @@ -586,8 +602,10 @@ mod tests { let location = "s3://bucket/path/FAKE/file.parquet"; // Set it to a non-existent file to avoid reading the default configuration file - std::env::set_var("AWS_CONFIG_FILE", "data/aws.config"); - std::env::set_var("AWS_SHARED_CREDENTIALS_FILE", "data/aws.credentials"); + unsafe { + std::env::set_var("AWS_CONFIG_FILE", "data/aws.config"); + std::env::set_var("AWS_SHARED_CREDENTIALS_FILE", "data/aws.credentials"); + } // No options let table_url = ListingTableUrl::parse(location)?; @@ -716,7 +734,10 @@ mod tests { .await .unwrap_err(); - assert_eq!(err.to_string().lines().next().unwrap_or_default(), "Invalid or Unsupported Configuration: Invalid endpoint: http://endpoint33. HTTP is not allowed for S3 endpoints. To allow HTTP, set 'aws.allow_http' to true"); + assert_eq!( + err.to_string().lines().next().unwrap_or_default(), + "Invalid or Unsupported Configuration: Invalid endpoint: http://endpoint33. HTTP is not allowed for S3 endpoints. To allow HTTP, set 'aws.allow_http' to true" + ); // Now add `allow_http` to the options and check if it works let sql = format!( @@ -743,10 +764,11 @@ mod tests { eprintln!("{e}"); return Ok(()); } - let expected_region = "eu-central-1"; let location = "s3://test-bucket/path/file.parquet"; // Set it to a non-existent file to avoid reading the default configuration file - std::env::set_var("AWS_CONFIG_FILE", "data/aws.config"); + unsafe { + std::env::set_var("AWS_CONFIG_FILE", "data/aws.config"); + } let table_url = ListingTableUrl::parse(location)?; let aws_options = AwsOptions { @@ -758,17 +780,18 @@ mod tests { get_s3_object_store_builder(table_url.as_ref(), &aws_options, false).await?; // Verify that the region was auto-detected in test environment - assert_eq!( - builder.get_config_value(&AmazonS3ConfigKey::Region), - Some(expected_region.to_string()) + assert!( + builder + .get_config_value(&AmazonS3ConfigKey::Region) + .is_some() ); Ok(()) } #[tokio::test] - async fn s3_object_store_builder_overrides_region_when_resolve_region_enabled( - ) -> Result<()> { + async fn s3_object_store_builder_overrides_region_when_resolve_region_enabled() + -> Result<()> { if let Err(DataFusionError::Execution(e)) = check_aws_envs().await { // Skip test if AWS envs are not set eprintln!("{e}"); @@ -806,7 +829,9 @@ mod tests { let table_url = ListingTableUrl::parse(location)?; let scheme = table_url.scheme(); - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}', 'aws.oss.endpoint' '{endpoint}') LOCATION '{location}'"); + let sql = format!( + "CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}', 'aws.oss.endpoint' '{endpoint}') LOCATION '{location}'" + ); let ctx = SessionContext::new(); ctx.register_table_options_extension_from_scheme(scheme); @@ -830,14 +855,15 @@ mod tests { 
#[tokio::test] async fn gcs_object_store_builder() -> Result<()> { let service_account_path = "fake_service_account_path"; - let service_account_key = - "{\"private_key\": \"fake_private_key.pem\",\"client_email\":\"fake_client_email\"}"; + let service_account_key = "{\"private_key\": \"fake_private_key.pem\",\"client_email\":\"fake_client_email\"}"; let application_credentials_path = "fake_application_credentials_path"; let location = "gcs://bucket/path/file.parquet"; let table_url = ListingTableUrl::parse(location)?; let scheme = table_url.scheme(); - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('gcp.service_account_path' '{service_account_path}', 'gcp.service_account_key' '{service_account_key}', 'gcp.application_credentials_path' '{application_credentials_path}') LOCATION '{location}'"); + let sql = format!( + "CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('gcp.service_account_path' '{service_account_path}', 'gcp.service_account_key' '{service_account_key}', 'gcp.application_credentials_path' '{application_credentials_path}') LOCATION '{location}'" + ); let ctx = SessionContext::new(); ctx.register_table_options_extension_from_scheme(scheme); diff --git a/datafusion-cli/src/object_storage/instrumented.rs b/datafusion-cli/src/object_storage/instrumented.rs index c4b63b417fe42..a0321cacb374b 100644 --- a/datafusion-cli/src/object_storage/instrumented.rs +++ b/datafusion-cli/src/object_storage/instrumented.rs @@ -20,8 +20,8 @@ use std::{ ops::AddAssign, str::FromStr, sync::{ - atomic::{AtomicU8, Ordering}, Arc, + atomic::{AtomicU8, AtomicU64, Ordering}, }, time::Duration, }; @@ -31,18 +31,67 @@ use arrow::util::pretty::pretty_format_batches; use async_trait::async_trait; use chrono::Utc; use datafusion::{ - common::{instant::Instant, HashMap}, + common::{HashMap, instant::Instant}, error::DataFusionError, execution::object_store::{DefaultObjectStoreRegistry, ObjectStoreRegistry}, }; -use futures::stream::BoxStream; +use futures::stream::{BoxStream, Stream}; +use futures::{StreamExt, TryStreamExt}; use object_store::{ - path::Path, GetOptions, GetRange, GetResult, ListResult, MultipartUpload, ObjectMeta, - ObjectStore, PutMultipartOptions, PutOptions, PutPayload, PutResult, Result, + CopyOptions, GetOptions, GetRange, GetResult, ListResult, MultipartUpload, + ObjectMeta, ObjectStore, ObjectStoreExt, PutMultipartOptions, PutOptions, PutPayload, + PutResult, Result, path::Path, }; use parking_lot::{Mutex, RwLock}; use url::Url; +/// A stream wrapper that measures the time until the first response(item or end of stream) is yielded. +/// +/// The timer starts on the first `poll_next` call (not at stream creation) to avoid +/// measuring unrelated work between stream creation and first poll. +/// Duration is stored as nanoseconds in an `AtomicU64` (0 = not yet set). 
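Before the definition that follows, a hedged usage sketch of the wrapper; it mirrors the unit test added later in this patch (`timed_list` is illustrative and assumes module-level access to the private type):

```rust
use std::sync::Arc;
use std::sync::atomic::{AtomicU64, Ordering};

use futures::StreamExt;
use object_store::{ObjectStore, path::Path};

// Wrap a listing stream so the shared counter records time-to-first-item.
async fn timed_list(store: &dyn ObjectStore, prefix: &Path) {
    let duration_nanos = Arc::new(AtomicU64::new(0));
    let mut stream =
        TimeToFirstItemStream::new(store.list(Some(prefix)), Arc::clone(&duration_nanos));
    let _first = stream.next().await; // first poll starts and then stops the timer
    // 0 still means "no item observed yet"; otherwise elapsed nanoseconds.
    let _elapsed = duration_nanos.load(Ordering::Acquire);
}
```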
+struct TimeToFirstItemStream<S> {
+    inner: S,
+    start: Option<Instant>,
+    request_duration: Arc<AtomicU64>,
+    duration_recorded: bool,
+}
+
+impl<S> TimeToFirstItemStream<S> {
+    fn new(inner: S, request_duration: Arc<AtomicU64>) -> Self {
+        Self {
+            inner,
+            start: None,
+            request_duration,
+            duration_recorded: false,
+        }
+    }
+}
+
+impl<S, T> Stream for TimeToFirstItemStream<S>
+where
+    S: Stream<Item = Result<T>> + Unpin,
+{
+    type Item = Result<T>;
+
+    fn poll_next(
+        mut self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> std::task::Poll<Option<Self::Item>> {
+        let start = *self.start.get_or_insert_with(Instant::now);
+
+        let poll_result = std::pin::Pin::new(&mut self.inner).poll_next(cx);
+
+        if !self.duration_recorded && poll_result.is_ready() {
+            self.duration_recorded = true;
+            let nanos = start.elapsed().as_nanos() as u64;
+            self.request_duration.store(nanos, Ordering::Release);
+        }
+
+        poll_result
+    }
+}
+
 /// The profiling mode to use for an [`InstrumentedObjectStore`] instance. Collecting profiling
 /// data will have a small negative impact on both CPU and memory usage. Default is `Disabled`
 #[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
@@ -57,7 +106,7 @@ pub enum InstrumentedObjectStoreMode {
 }

 impl fmt::Display for InstrumentedObjectStoreMode {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         write!(f, "{self:?}")
     }
 }
@@ -91,7 +140,7 @@ impl From<u8> for InstrumentedObjectStoreMode {
 pub struct InstrumentedObjectStore {
     inner: Arc<dyn ObjectStore>,
     instrument_mode: AtomicU8,
-    requests: Mutex<Vec<RequestDetails>>,
+    requests: Arc<Mutex<Vec<RequestDetails>>>,
 }

 impl InstrumentedObjectStore {
@@ -100,7 +149,7 @@
         Self {
             inner: object_store,
             instrument_mode,
-            requests: Mutex::new(Vec::new()),
+            requests: Arc::new(Mutex::new(Vec::new())),
         }
     }

@@ -137,7 +186,7 @@
             op: Operation::Put,
             path: location.clone(),
             timestamp,
-            duration: Some(elapsed),
+            duration_nanos: Arc::new(AtomicU64::new(elapsed.as_nanos() as u64)),
             size: Some(size),
             range: None,
             extra_display: None,
@@ -160,7 +209,7 @@
             op: Operation::Put,
             path: location.clone(),
             timestamp,
-            duration: Some(elapsed),
+            duration_nanos: Arc::new(AtomicU64::new(elapsed.as_nanos() as u64)),
             size: None,
             range: None,
             extra_display: None,
@@ -177,16 +226,26 @@
         let timestamp = Utc::now();
         let range = options.range.clone();
+        let head = options.head;
         let start = Instant::now();
         let ret = self.inner.get_opts(location, options).await?;
         let elapsed = start.elapsed();

+        let (op, size) = if head {
+            (Operation::Head, None)
+        } else {
+            (
+                Operation::Get,
+                Some((ret.range.end - ret.range.start) as usize),
+            )
+        };
+
         self.requests.lock().push(RequestDetails {
-            op: Operation::Get,
+            op,
             path: location.clone(),
             timestamp,
-            duration: Some(elapsed),
-            size: Some((ret.range.end - ret.range.start) as usize),
+            duration_nanos: Arc::new(AtomicU64::new(elapsed.as_nanos() as u64)),
+            size,
             range,
             extra_display: None,
         });
@@ -194,23 +253,30 @@
         Ok(ret)
     }

-    async fn instrumented_delete(&self, location: &Path) -> Result<()> {
+    fn instrumented_delete_stream(
+        &self,
+        locations: BoxStream<'static, Result<Path>>,
+    ) -> BoxStream<'static, Result<Path>> {
+        let requests_captured = Arc::clone(&self.requests);
+        let timestamp = Utc::now();
         let start = Instant::now();
-        self.inner.delete(location).await?;
-        let elapsed = start.elapsed();
-
-        self.requests.lock().push(RequestDetails {
-            op: Operation::Delete,
-            path: location.clone(),
-            timestamp,
-            duration:
Some(elapsed), - size: None, - range: None, - extra_display: None, - }); - - Ok(()) + self.inner + .delete_stream(locations) + .and_then(move |location| { + let elapsed = start.elapsed(); + requests_captured.lock().push(RequestDetails { + op: Operation::Delete, + path: location.clone(), + timestamp, + duration_nanos: Arc::new(AtomicU64::new(elapsed.as_nanos() as u64)), + size: None, + range: None, + extra_display: None, + }); + futures::future::ok(location) + }) + .boxed() } fn instrumented_list( @@ -218,19 +284,20 @@ impl InstrumentedObjectStore { prefix: Option<&Path>, ) -> BoxStream<'static, Result> { let timestamp = Utc::now(); - let ret = self.inner.list(prefix); + let inner_stream = self.inner.list(prefix); + let duration_nanos = Arc::new(AtomicU64::new(0)); self.requests.lock().push(RequestDetails { op: Operation::List, path: prefix.cloned().unwrap_or_else(|| Path::from("")), timestamp, - duration: None, // list returns a stream, so the duration isn't meaningful + duration_nanos: Arc::clone(&duration_nanos), size: None, range: None, extra_display: None, }); - ret + Box::pin(TimeToFirstItemStream::new(inner_stream, duration_nanos)) } async fn instrumented_list_with_delimiter( @@ -246,7 +313,7 @@ impl InstrumentedObjectStore { op: Operation::List, path: prefix.cloned().unwrap_or_else(|| Path::from("")), timestamp, - duration: Some(elapsed), + duration_nanos: Arc::new(AtomicU64::new(elapsed.as_nanos() as u64)), size: None, range: None, extra_display: None, @@ -265,7 +332,7 @@ impl InstrumentedObjectStore { op: Operation::Copy, path: from.clone(), timestamp, - duration: Some(elapsed), + duration_nanos: Arc::new(AtomicU64::new(elapsed.as_nanos() as u64)), size: None, range: None, extra_display: Some(format!("copy_to: {to}")), @@ -288,7 +355,7 @@ impl InstrumentedObjectStore { op: Operation::Copy, path: from.clone(), timestamp, - duration: Some(elapsed), + duration_nanos: Arc::new(AtomicU64::new(elapsed.as_nanos() as u64)), size: None, range: None, extra_display: Some(format!("copy_to: {to}")), @@ -296,29 +363,10 @@ impl InstrumentedObjectStore { Ok(()) } - - async fn instrumented_head(&self, location: &Path) -> Result { - let timestamp = Utc::now(); - let start = Instant::now(); - let ret = self.inner.head(location).await?; - let elapsed = start.elapsed(); - - self.requests.lock().push(RequestDetails { - op: Operation::Head, - path: location.clone(), - timestamp, - duration: Some(elapsed), - size: None, - range: None, - extra_display: None, - }); - - Ok(ret) - } } impl fmt::Display for InstrumentedObjectStore { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mode: InstrumentedObjectStoreMode = self.instrument_mode.load(Ordering::Relaxed).into(); write!( @@ -364,12 +412,15 @@ impl ObjectStore for InstrumentedObjectStore { self.inner.get_opts(location, options).await } - async fn delete(&self, location: &Path) -> Result<()> { + fn delete_stream( + &self, + locations: BoxStream<'static, Result>, + ) -> BoxStream<'static, Result> { if self.enabled() { - return self.instrumented_delete(location).await; + return self.instrumented_delete_stream(locations); } - self.inner.delete(location).await + self.inner.delete_stream(locations) } fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { @@ -388,28 +439,24 @@ impl ObjectStore for InstrumentedObjectStore { self.inner.list_with_delimiter(prefix).await } - async fn copy(&self, from: &Path, to: &Path) -> Result<()> { - if self.enabled() { - return 
self.instrumented_copy(from, to).await; - } - - self.inner.copy(from, to).await - } - - async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { - if self.enabled() { - return self.instrumented_copy_if_not_exists(from, to).await; - } - - self.inner.copy_if_not_exists(from, to).await - } - - async fn head(&self, location: &Path) -> Result { + async fn copy_opts( + &self, + from: &Path, + to: &Path, + options: CopyOptions, + ) -> Result<()> { if self.enabled() { - return self.instrumented_head(location).await; + return match options.mode { + object_store::CopyMode::Create => { + self.instrumented_copy_if_not_exists(from, to).await + } + object_store::CopyMode::Overwrite => { + self.instrumented_copy(from, to).await + } + }; } - self.inner.head(location).await + self.inner.copy_opts(from, to, options).await } } @@ -425,32 +472,57 @@ pub enum Operation { } impl fmt::Display for Operation { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{self:?}") } } /// Holds profiling details about individual requests made through an [`InstrumentedObjectStore`] -#[derive(Debug)] pub struct RequestDetails { op: Operation, path: Path, timestamp: chrono::DateTime, - duration: Option, + /// Duration stored as nanoseconds in an AtomicU64. 0 means not yet set. + duration_nanos: Arc, size: Option, range: Option, extra_display: Option, } +impl fmt::Debug for RequestDetails { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("RequestDetails") + .field("op", &self.op) + .field("path", &self.path) + .field("timestamp", &self.timestamp) + .field("duration", &self.duration()) + .field("size", &self.size) + .field("range", &self.range) + .field("extra_display", &self.extra_display) + .finish() + } +} + +impl RequestDetails { + fn duration(&self) -> Option { + let nanos = self.duration_nanos.load(Ordering::Acquire); + if nanos == 0 { + None + } else { + Some(Duration::from_nanos(nanos)) + } + } +} + impl fmt::Display for RequestDetails { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut output_parts = vec![format!( "{} operation={:?}", self.timestamp.to_rfc3339(), self.op )]; - if let Some(d) = self.duration { + if let Some(d) = self.duration() { output_parts.push(format!("duration={:.6}s", d.as_secs_f32())); } if let Some(s) = self.size { @@ -637,7 +709,7 @@ impl RequestSummary { } fn push(&mut self, request: &RequestDetails) { self.count += 1; - if let Some(dur) = request.duration { + if let Some(dur) = request.duration() { self.duration_stats.get_or_insert_default().push(dur) } if let Some(size) = request.size { @@ -758,6 +830,7 @@ impl ObjectStoreRegistry for InstrumentedObjectStoreRegistry { #[cfg(test)] mod tests { + use futures::StreamExt; use object_store::WriteMultipart; use super::*; @@ -782,9 +855,11 @@ mod tests { "TRaCe".parse().unwrap(), InstrumentedObjectStoreMode::Trace )); - assert!("does_not_exist" - .parse::() - .is_err()); + assert!( + "does_not_exist" + .parse::() + .is_err() + ); assert!(matches!(0.into(), InstrumentedObjectStoreMode::Disabled)); assert!(matches!(1.into(), InstrumentedObjectStoreMode::Summary)); @@ -850,7 +925,7 @@ mod tests { let request = requests.pop().unwrap(); assert_eq!(request.op, Operation::Get); assert_eq!(request.path, path); - assert!(request.duration.is_some()); + assert!(request.duration().is_some()); assert_eq!(request.size, Some(9)); 
assert_eq!(request.range, None); assert!(request.extra_display.is_none()); @@ -879,7 +954,7 @@ mod tests { let request = requests.pop().unwrap(); assert_eq!(request.op, Operation::Delete); assert_eq!(request.path, path); - assert!(request.duration.is_some()); + assert!(request.duration().is_some()); assert!(request.size.is_none()); assert!(request.range.is_none()); assert!(request.extra_display.is_none()); @@ -896,18 +971,58 @@ mod tests { instrumented.set_instrument_mode(InstrumentedObjectStoreMode::Trace); assert!(instrumented.requests.lock().is_empty()); - let _ = instrumented.list(Some(&path)); + let mut stream = instrumented.list(Some(&path)); + // Sleep between stream creation and first poll to verify the timer + // starts on first poll, not at stream creation. + let delay = Duration::from_millis(50); + tokio::time::sleep(delay).await; + let _ = stream.next().await; assert_eq!(instrumented.requests.lock().len(), 1); let request = instrumented.take_requests().pop().unwrap(); assert_eq!(request.op, Operation::List); assert_eq!(request.path, path); - assert!(request.duration.is_none()); + let duration = request + .duration() + .expect("duration should be set after consuming stream"); + assert!( + duration < delay, + "duration {duration:?} should exclude the {delay:?} sleep before first poll" + ); assert!(request.size.is_none()); assert!(request.range.is_none()); assert!(request.extra_display.is_none()); } + #[tokio::test] + async fn time_to_first_item_stream_captures_inner_latency() { + let inner_delay = Duration::from_millis(50); + let inner_stream = futures::stream::once(async move { + tokio::time::sleep(inner_delay).await; + Ok(ObjectMeta { + location: Path::from("test"), + last_modified: Utc::now(), + size: 0, + e_tag: None, + version: None, + }) + }) + .boxed(); + + let duration_nanos = Arc::new(AtomicU64::new(0)); + let mut stream = Box::pin(TimeToFirstItemStream::new( + inner_stream, + Arc::clone(&duration_nanos), + )); + let _ = stream.next().await; + + let recorded = Duration::from_nanos(duration_nanos.load(Ordering::Acquire)); + assert!( + recorded >= inner_delay, + "recorded duration {recorded:?} should be >= inner stream delay {inner_delay:?}" + ); + } + #[tokio::test] async fn instrumented_store_list_with_delimiter() { let (instrumented, path) = setup_test_store().await; @@ -925,7 +1040,7 @@ mod tests { let request = instrumented.take_requests().pop().unwrap(); assert_eq!(request.op, Operation::List); assert_eq!(request.path, path); - assert!(request.duration.is_some()); + assert!(request.duration().is_some()); assert!(request.size.is_none()); assert!(request.range.is_none()); assert!(request.extra_display.is_none()); @@ -956,7 +1071,7 @@ mod tests { let request = instrumented.take_requests().pop().unwrap(); assert_eq!(request.op, Operation::Put); assert_eq!(request.path, path); - assert!(request.duration.is_some()); + assert!(request.duration().is_some()); assert_eq!(request.size.unwrap(), size); assert!(request.range.is_none()); assert!(request.extra_display.is_none()); @@ -991,7 +1106,7 @@ mod tests { let request = instrumented.take_requests().pop().unwrap(); assert_eq!(request.op, Operation::Put); assert_eq!(request.path, path); - assert!(request.duration.is_some()); + assert!(request.duration().is_some()); assert!(request.size.is_none()); assert!(request.range.is_none()); assert!(request.extra_display.is_none()); @@ -1019,7 +1134,7 @@ mod tests { let request = requests.pop().unwrap(); assert_eq!(request.op, Operation::Copy); assert_eq!(request.path, path); - 
assert!(request.duration.is_some()); + assert!(request.duration().is_some()); assert!(request.size.is_none()); assert!(request.range.is_none()); assert_eq!( @@ -1058,7 +1173,7 @@ mod tests { let request = requests.pop().unwrap(); assert_eq!(request.op, Operation::Copy); assert_eq!(request.path, path); - assert!(request.duration.is_some()); + assert!(request.duration().is_some()); assert!(request.size.is_none()); assert!(request.range.is_none()); assert_eq!( @@ -1088,7 +1203,7 @@ mod tests { let request = requests.pop().unwrap(); assert_eq!(request.op, Operation::Head); assert_eq!(request.path, path); - assert!(request.duration.is_some()); + assert!(request.duration().is_some()); assert!(request.size.is_none()); assert!(request.range.is_none()); assert!(request.extra_display.is_none()); @@ -1100,7 +1215,9 @@ mod tests { op: Operation::Get, path: Path::from("test"), timestamp: chrono::DateTime::from_timestamp(0, 0).unwrap(), - duration: Some(Duration::new(5, 0)), + duration_nanos: Arc::new(AtomicU64::new( + Duration::new(5, 0).as_nanos() as u64 + )), size: Some(10), range: Some((..10).into()), extra_display: Some(String::from("extra info")), @@ -1127,7 +1244,9 @@ mod tests { op: Operation::Get, path: Path::from("test1"), timestamp: chrono::DateTime::from_timestamp(0, 0).unwrap(), - duration: Some(Duration::from_secs(5)), + duration_nanos: Arc::new(AtomicU64::new( + Duration::from_secs(5).as_nanos() as u64 + )), size: Some(100), range: None, extra_display: None, @@ -1147,7 +1266,9 @@ mod tests { op: Operation::Get, path: Path::from("test2"), timestamp: chrono::DateTime::from_timestamp(1, 0).unwrap(), - duration: Some(Duration::from_secs(8)), + duration_nanos: Arc::new(AtomicU64::new( + Duration::from_secs(8).as_nanos() as u64 + )), size: Some(150), range: None, extra_display: None, @@ -1156,7 +1277,9 @@ mod tests { op: Operation::Get, path: Path::from("test3"), timestamp: chrono::DateTime::from_timestamp(2, 0).unwrap(), - duration: Some(Duration::from_secs(2)), + duration_nanos: Arc::new(AtomicU64::new( + Duration::from_secs(2).as_nanos() as u64 + )), size: Some(50), range: None, extra_display: None, @@ -1175,7 +1298,9 @@ mod tests { op: Operation::Put, path: Path::from("test4"), timestamp: chrono::DateTime::from_timestamp(3, 0).unwrap(), - duration: Some(Duration::from_millis(200)), + duration_nanos: Arc::new(AtomicU64::new( + Duration::from_millis(200).as_nanos() as u64, + )), size: Some(75), range: None, extra_display: None, @@ -1200,7 +1325,9 @@ mod tests { op: Operation::Get, path: Path::from("test1"), timestamp: chrono::DateTime::from_timestamp(0, 0).unwrap(), - duration: Some(Duration::from_secs(3)), + duration_nanos: Arc::new(AtomicU64::new( + Duration::from_secs(3).as_nanos() as u64 + )), size: None, range: None, extra_display: None, @@ -1222,7 +1349,7 @@ mod tests { op: Operation::Get, path: Path::from("test1"), timestamp: chrono::DateTime::from_timestamp(0, 0).unwrap(), - duration: None, + duration_nanos: Arc::new(AtomicU64::new(0)), size: Some(200), range: None, extra_display: None, @@ -1244,7 +1371,7 @@ mod tests { op: Operation::Get, path: Path::from("test1"), timestamp: chrono::DateTime::from_timestamp(0, 0).unwrap(), - duration: None, + duration_nanos: Arc::new(AtomicU64::new(0)), size: None, range: None, extra_display: None, diff --git a/datafusion-cli/src/print_format.rs b/datafusion-cli/src/print_format.rs index 56bdb15a315d9..0443a7a289602 100644 --- a/datafusion-cli/src/print_format.rs +++ b/datafusion-cli/src/print_format.rs @@ -97,7 +97,7 @@ fn keep_only_maxrows(s: 
&str, maxrows: usize) -> String { let last_line = &lines[lines.len() - 1]; // bottom border line let spaces = last_line.len().saturating_sub(4); - let dotted_line = format!("| .{: SchemaRef { + let fields: Vec = (0..10) + .map(|i| Field::new(format!("c{i}"), DataType::Int32, false)) + .collect(); + Arc::new(Schema::new(fields)) + } + + /// return a batch with many columns and three rows + fn wide_column_batch() -> RecordBatch { + let arrays: Vec> = (0..10) + .map(|_| Arc::new(Int32Array::from(vec![0, 1, 2])) as _) + .collect(); + RecordBatch::try_new(wide_column_schema(), arrays).unwrap() + } + /// Slice the record batch into 2 batches - fn split_batch(batch: RecordBatch) -> Vec { + fn split_batch(batch: &RecordBatch) -> Vec { assert!(batch.num_rows() > 1); let split = batch.num_rows() / 2; vec![ diff --git a/datafusion-cli/src/print_options.rs b/datafusion-cli/src/print_options.rs index 93d1d450fd82b..d0810cb034df1 100644 --- a/datafusion-cli/src/print_options.rs +++ b/datafusion-cli/src/print_options.rs @@ -28,8 +28,8 @@ use crate::print_format::PrintFormat; use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; -use datafusion::common::instant::Instant; use datafusion::common::DataFusionError; +use datafusion::common::instant::Instant; use datafusion::error::Result; use datafusion::physical_plan::RecordBatchStream; @@ -55,8 +55,10 @@ impl FromStr for MaxRows { Ok(Self::Unlimited) } else { match maxrows.parse::() { - Ok(nrows) => Ok(Self::Limited(nrows)), - _ => Err(format!("Invalid maxrows {maxrows}. Valid inputs are natural numbers or \'none\', \'inf\', or \'infinite\' for no limit.")), + Ok(nrows) => Ok(Self::Limited(nrows)), + _ => Err(format!( + "Invalid maxrows {maxrows}. Valid inputs are natural numbers or \'none\', \'inf\', or \'infinite\' for no limit." 
+ )), } } } @@ -113,7 +115,7 @@ impl PrintOptions { row_count: usize, format_options: &FormatOptions, ) -> Result<()> { - let stdout = std::io::stdout(); + let stdout = io::stdout(); let mut writer = stdout.lock(); self.format.print_batches( @@ -135,7 +137,7 @@ impl PrintOptions { query_start_time, ); - self.write_output(&mut writer, formatted_exec_details) + self.write_output(&mut writer, &formatted_exec_details) } /// Print the stream to stdout using the specified format @@ -151,7 +153,7 @@ impl PrintOptions { )); }; - let stdout = std::io::stdout(); + let stdout = io::stdout(); let mut writer = stdout.lock(); let mut row_count = 0_usize; @@ -177,13 +179,13 @@ impl PrintOptions { query_start_time, ); - self.write_output(&mut writer, formatted_exec_details) + self.write_output(&mut writer, &formatted_exec_details) } fn write_output( &self, writer: &mut W, - formatted_exec_details: String, + formatted_exec_details: &str, ) -> Result<()> { if !self.quiet { writeln!(writer, "{formatted_exec_details}")?; @@ -235,11 +237,11 @@ mod tests { let mut print_output: Vec = Vec::new(); let exec_out = String::from("Formatted Exec Output"); - print_options.write_output(&mut print_output, exec_out.clone())?; + print_options.write_output(&mut print_output, &exec_out)?; assert!(print_output.is_empty()); print_options.quiet = false; - print_options.write_output(&mut print_output, exec_out.clone())?; + print_options.write_output(&mut print_output, &exec_out)?; let out_str: String = print_output .clone() .try_into() @@ -251,7 +253,7 @@ mod tests { print_options .instrumented_registry .set_instrument_mode(InstrumentedObjectStoreMode::Trace); - print_options.write_output(&mut print_output, exec_out.clone())?; + print_options.write_output(&mut print_output, &exec_out)?; let out_str: String = print_output .clone() .try_into() diff --git a/datafusion-cli/tests/cli_integration.rs b/datafusion-cli/tests/cli_integration.rs index c1395aa4f562c..be4a2ad4fe197 100644 --- a/datafusion-cli/tests/cli_integration.rs +++ b/datafusion-cli/tests/cli_integration.rs @@ -20,14 +20,17 @@ use std::process::Command; use rstest::rstest; use async_trait::async_trait; -use insta::{glob, Settings}; +use insta::internals::SettingsBindDropGuard; +use insta::{Settings, glob}; use insta_cmd::{assert_cmd_snapshot, get_cargo_bin}; use std::path::PathBuf; use std::{env, fs}; -use testcontainers::core::{CmdWaitFor, ExecCommand, Mount}; -use testcontainers::runners::AsyncRunner; -use testcontainers::{ContainerAsync, ImageExt, TestcontainersError}; use testcontainers_modules::minio; +use testcontainers_modules::testcontainers::core::{CmdWaitFor, ExecCommand, Mount}; +use testcontainers_modules::testcontainers::runners::AsyncRunner; +use testcontainers_modules::testcontainers::{ + ContainerAsync, ImageExt, TestcontainersError, +}; fn cli() -> Command { Command::new(get_cargo_bin("datafusion-cli")) @@ -42,7 +45,7 @@ fn make_settings() -> Settings { settings } -async fn setup_minio_container() -> ContainerAsync { +async fn setup_minio_container() -> Result, String> { const MINIO_ROOT_USER: &str = "TEST-DataFusionLogin"; const MINIO_ROOT_PASSWORD: &str = "TEST-DataFusionPassword"; @@ -97,25 +100,23 @@ async fn setup_minio_container() -> ContainerAsync { let stdout = container.stdout_to_vec().await.unwrap_or_default(); let stderr = container.stderr_to_vec().await.unwrap_or_default(); - panic!( + return Err(format!( "Failed to execute command: {}\nError: {}\nStdout: {:?}\nStderr: {:?}", cmd_ref, e, String::from_utf8_lossy(&stdout), 
String::from_utf8_lossy(&stderr) - ); + )); } } - container + Ok(container) } - Err(TestcontainersError::Client(e)) => { - panic!("Failed to start MinIO container. Ensure Docker is running and accessible: {e}"); - } - Err(e) => { - panic!("Failed to start MinIO container: {e}"); - } + Err(TestcontainersError::Client(e)) => Err(format!( + "Failed to start MinIO container. Ensure Docker is running and accessible: {e}" + )), + Err(e) => Err(format!("Failed to start MinIO container: {e}")), } } @@ -215,6 +216,42 @@ fn test_cli_top_memory_consumers<'a>( #[case] snapshot_name: &str, #[case] top_memory_consumers: impl IntoIterator<Item = &'a str>, ) { + let _bound = bind_to_settings(snapshot_name); + + let mut cmd = cli(); + let sql = "select * from generate_series(1,500000) as t1(v1) order by v1;"; + cmd.args(["--memory-limit", "10M", "--command", sql]); + cmd.args(top_memory_consumers); + + assert_cmd_snapshot!(cmd); +} + +#[rstest] +#[case("no_track", ["--top-memory-consumers", "0"])] +#[case("top2", ["--top-memory-consumers", "2"])] +#[test] +fn test_cli_top_memory_consumers_with_mem_pool_type<'a>( + #[case] snapshot_name: &str, + #[case] top_memory_consumers: impl IntoIterator<Item = &'a str>, +) { + let _bound = bind_to_settings(snapshot_name); + + let mut cmd = cli(); + let sql = "select * from generate_series(1,500000) as t1(v1) order by v1;"; + cmd.args([ + "--memory-limit", + "10M", + "--mem-pool-type", + "fair", + "--command", + sql, + ]); + cmd.args(top_memory_consumers); + + assert_cmd_snapshot!(cmd); +} + +fn bind_to_settings(snapshot_name: &str) -> SettingsBindDropGuard { let mut settings = make_settings(); settings.set_snapshot_suffix(snapshot_name); @@ -224,20 +261,45 @@ fn test_cli_top_memory_consumers<'a>( "Consumer(can spill: bool) consumed XB, peak XB", ); settings.add_filter( - r"Error: Failed to allocate additional .*? for .*? with .*? already allocated for this reservation - .*? remain available for the total pool", + r"Error: Failed to allocate additional .*? for .*? with .*? already allocated for this reservation - .*? remain available for the total memory pool: '.*?'", "Error: Failed to allocate ", ); settings.add_filter( - r"Resources exhausted: Failed to allocate additional .*? for .*? with .*? already allocated for this reservation - .*? remain available for the total pool", + r"Resources exhausted: Failed to allocate additional .*? for .*? with .*? already allocated for this reservation - .*? 
remain available for the total memory pool: '.*?'", "Resources exhausted: Failed to allocate", ); + settings.bind_to_scope() +} + +#[test] +fn test_cli_with_unbounded_memory_pool() { + let mut settings = make_settings(); + + settings.set_snapshot_suffix("default"); + let _bound = settings.bind_to_scope(); let mut cmd = cli(); let sql = "select * from generate_series(1,500000) as t1(v1) order by v1;"; - cmd.args(["--memory-limit", "10M", "--command", sql]); - cmd.args(top_memory_consumers); + cmd.args(["--maxrows", "10", "--command", sql]); + + assert_cmd_snapshot!(cmd); +} + +#[test] +fn test_cli_wide_result_set_no_crash() { + let mut settings = make_settings(); + + settings.set_snapshot_suffix("wide_result_set"); + + let _bound = settings.bind_to_scope(); + + let mut cmd = cli(); + let sql = "SELECT v1 as c0, v1+1 as c1, v1+2 as c2, v1+3 as c3, v1+4 as c4, \ + v1+5 as c5, v1+6 as c6, v1+7 as c7, v1+8 as c8, v1+9 as c9 \ + FROM generate_series(1, 100) as t1(v1);"; + cmd.args(["--maxrows", "5", "--command", sql]); assert_cmd_snapshot!(cmd); } @@ -249,7 +311,14 @@ async fn test_cli() { return; } - let container = setup_minio_container().await; + let container = match setup_minio_container().await { + Ok(c) => c, + Err(e) if e.contains("toomanyrequests") => { + eprintln!("Skipping test: Docker pull rate limit reached: {e}"); + return; + } + e @ Err(_) => e.unwrap(), + }; let settings = make_settings(); let _bound = settings.bind_to_scope(); @@ -258,13 +327,15 @@ async fn test_cli() { glob!("sql/integration/*.sql", |path| { let input = fs::read_to_string(path).unwrap(); - assert_cmd_snapshot!(cli() - .env_clear() - .env("AWS_ACCESS_KEY_ID", "TEST-DataFusionLogin") - .env("AWS_SECRET_ACCESS_KEY", "TEST-DataFusionPassword") - .env("AWS_ENDPOINT", format!("http://localhost:{port}")) - .env("AWS_ALLOW_HTTP", "true") - .pass_stdin(input)) + assert_cmd_snapshot!( + cli() + .env_clear() + .env("AWS_ACCESS_KEY_ID", "TEST-DataFusionLogin") + .env("AWS_SECRET_ACCESS_KEY", "TEST-DataFusionPassword") + .env("AWS_ENDPOINT", format!("http://localhost:{port}")) + .env("AWS_ALLOW_HTTP", "true") + .pass_stdin(input) + ) }); } @@ -280,7 +351,14 @@ async fn test_aws_options() { let settings = make_settings(); let _bound = settings.bind_to_scope(); - let container = setup_minio_container().await; + let container = match setup_minio_container().await { + Ok(c) => c, + Err(e) if e.contains("toomanyrequests") => { + eprintln!("Skipping test: Docker pull rate limit reached: {e}"); + return; + } + e @ Err(_) => e.unwrap(), + }; let port = container.get_host_port_ipv4(9000).await.unwrap(); let input = format!( @@ -328,10 +406,12 @@ SELECT COUNT(*) FROM hits; "# ); - assert_cmd_snapshot!(cli() - .env("RUST_LOG", "warn") - .env_remove("AWS_ENDPOINT") - .pass_stdin(input)); + assert_cmd_snapshot!( + cli() + .env("RUST_LOG", "warn") + .env_remove("AWS_ENDPOINT") + .pass_stdin(input) + ); } /// Ensure backtrace will be printed, if executing `datafusion-cli` with a query @@ -351,14 +431,12 @@ fn test_backtrace_output(#[case] query: &str) { let output = cmd.output().expect("Failed to execute command"); let stdout = String::from_utf8_lossy(&output.stdout); let stderr = String::from_utf8_lossy(&output.stderr); - let combined_output = format!("{}{}", stdout, stderr); + let combined_output = format!("{stdout}{stderr}"); // Assert that the output includes literal 'backtrace' assert!( combined_output.to_lowercase().contains("backtrace"), - "Expected output to contain 'backtrace', but got stdout: '{}' stderr: '{}'", - stdout, - 
stderr + "Expected output to contain 'backtrace', but got stdout: '{stdout}' stderr: '{stderr}'" ); } @@ -369,7 +447,14 @@ async fn test_s3_url_fallback() { return; } - let container = setup_minio_container().await; + let container = match setup_minio_container().await { + Ok(c) => c, + Err(e) if e.contains("toomanyrequests") => { + eprintln!("Skipping test: Docker pull rate limit reached: {e}"); + return; + } + e @ Err(_) => e.unwrap(), + }; let mut settings = make_settings(); settings.set_snapshot_suffix("s3_url_fallback"); @@ -399,8 +484,14 @@ async fn test_object_store_profiling() { return; } - let container = setup_minio_container().await; - + let container = match setup_minio_container().await { + Ok(c) => c, + Err(e) if e.contains("toomanyrequests") => { + eprintln!("Skipping test: Docker pull rate limit reached: {e}"); + return; + } + e @ Err(_) => e.unwrap(), + }; let mut settings = make_settings(); // as the object store profiling contains timestamps and durations, we must @@ -450,7 +541,7 @@ SELECT * from CARS LIMIT 1; #[async_trait] trait MinioCommandExt { async fn with_minio(&mut self, container: &ContainerAsync<minio::MinIO>) - -> &mut Self; + -> &mut Self; } #[async_trait] diff --git a/datafusion-cli/tests/snapshots/cli_explain_environment_overrides@explain_plan_environment_overrides.snap b/datafusion-cli/tests/snapshots/cli_explain_environment_overrides@explain_plan_environment_overrides.snap index 6b3a247dd7b82..5f43ca88dc9d7 100644 --- a/datafusion-cli/tests/snapshots/cli_explain_environment_overrides@explain_plan_environment_overrides.snap +++ b/datafusion-cli/tests/snapshots/cli_explain_environment_overrides@explain_plan_environment_overrides.snap @@ -7,7 +7,6 @@ info: - EXPLAIN SELECT 123 env: DATAFUSION_EXPLAIN_FORMAT: pgjson -snapshot_kind: text --- success: true exit_code: 0 @@ -19,19 +18,19 @@ exit_code: 0 | logical_plan | [ | | | { | | | "Plan": { | -| | "Expressions": [ | -| | "Int64(123)" | -| | ], | | | "Node Type": "Projection", | -| | "Output": [ | +| | "Expressions": [ | | | "Int64(123)" | | | ], | | | "Plans": [ | | | { | | | "Node Type": "EmptyRelation", | -| | "Output": [], | -| | "Plans": [] | +| | "Plans": [], | +| | "Output": [] | | | } | +| | ], | +| | "Output": [ | +| | "Int64(123)" | | | ] | | | } | | | } | diff --git a/datafusion-cli/tests/snapshots/cli_format@automatic.snap b/datafusion-cli/tests/snapshots/cli_format@automatic.snap index 2591f493e90a8..76b14d9a3a924 100644 --- a/datafusion-cli/tests/snapshots/cli_format@automatic.snap +++ b/datafusion-cli/tests/snapshots/cli_format@automatic.snap @@ -1,5 +1,5 @@ --- -source: tests/cli_integration.rs +source: datafusion-cli/tests/cli_integration.rs info: program: datafusion-cli args: diff --git a/datafusion-cli/tests/snapshots/cli_format@csv.snap b/datafusion-cli/tests/snapshots/cli_format@csv.snap index c41b042298eb0..2c969bd91d121 100644 --- a/datafusion-cli/tests/snapshots/cli_format@csv.snap +++ b/datafusion-cli/tests/snapshots/cli_format@csv.snap @@ -1,5 +1,5 @@ --- -source: tests/cli_integration.rs +source: datafusion-cli/tests/cli_integration.rs info: program: datafusion-cli args: diff --git a/datafusion-cli/tests/snapshots/cli_format@json.snap b/datafusion-cli/tests/snapshots/cli_format@json.snap index 8f804a337cce5..22a9cc4657a91 100644 --- a/datafusion-cli/tests/snapshots/cli_format@json.snap +++ b/datafusion-cli/tests/snapshots/cli_format@json.snap @@ -1,5 +1,5 @@ --- -source: tests/cli_integration.rs +source: datafusion-cli/tests/cli_integration.rs info: program: datafusion-cli args: diff --git
a/datafusion-cli/tests/snapshots/cli_format@nd-json.snap b/datafusion-cli/tests/snapshots/cli_format@nd-json.snap index 7b4ce1e2530cf..513bcb7372ca6 100644 --- a/datafusion-cli/tests/snapshots/cli_format@nd-json.snap +++ b/datafusion-cli/tests/snapshots/cli_format@nd-json.snap @@ -1,5 +1,5 @@ --- -source: tests/cli_integration.rs +source: datafusion-cli/tests/cli_integration.rs info: program: datafusion-cli args: diff --git a/datafusion-cli/tests/snapshots/cli_format@table.snap b/datafusion-cli/tests/snapshots/cli_format@table.snap index 99914182462aa..8677847588385 100644 --- a/datafusion-cli/tests/snapshots/cli_format@table.snap +++ b/datafusion-cli/tests/snapshots/cli_format@table.snap @@ -1,5 +1,5 @@ --- -source: tests/cli_integration.rs +source: datafusion-cli/tests/cli_integration.rs info: program: datafusion-cli args: diff --git a/datafusion-cli/tests/snapshots/cli_format@tsv.snap b/datafusion-cli/tests/snapshots/cli_format@tsv.snap index 968268c31dd55..c56e60fcab155 100644 --- a/datafusion-cli/tests/snapshots/cli_format@tsv.snap +++ b/datafusion-cli/tests/snapshots/cli_format@tsv.snap @@ -1,5 +1,5 @@ --- -source: tests/cli_integration.rs +source: datafusion-cli/tests/cli_integration.rs info: program: datafusion-cli args: diff --git a/datafusion-cli/tests/snapshots/cli_quick_test@batch_size.snap b/datafusion-cli/tests/snapshots/cli_quick_test@batch_size.snap index c27d527df0b6a..9fd07fa6f4e1b 100644 --- a/datafusion-cli/tests/snapshots/cli_quick_test@batch_size.snap +++ b/datafusion-cli/tests/snapshots/cli_quick_test@batch_size.snap @@ -1,5 +1,5 @@ --- -source: tests/cli_integration.rs +source: datafusion-cli/tests/cli_integration.rs info: program: datafusion-cli args: diff --git a/datafusion-cli/tests/snapshots/cli_quick_test@default_explain_plan.snap b/datafusion-cli/tests/snapshots/cli_quick_test@default_explain_plan.snap index 46ee6be64f624..8620f6da84488 100644 --- a/datafusion-cli/tests/snapshots/cli_quick_test@default_explain_plan.snap +++ b/datafusion-cli/tests/snapshots/cli_quick_test@default_explain_plan.snap @@ -5,7 +5,6 @@ info: args: - "--command" - EXPLAIN SELECT 123 -snapshot_kind: text --- success: true exit_code: 0 diff --git a/datafusion-cli/tests/snapshots/cli_quick_test@files.snap b/datafusion-cli/tests/snapshots/cli_quick_test@files.snap index 7c44e41729a17..df3a10b6bb54b 100644 --- a/datafusion-cli/tests/snapshots/cli_quick_test@files.snap +++ b/datafusion-cli/tests/snapshots/cli_quick_test@files.snap @@ -1,5 +1,5 @@ --- -source: tests/cli_integration.rs +source: datafusion-cli/tests/cli_integration.rs info: program: datafusion-cli args: diff --git a/datafusion-cli/tests/snapshots/cli_quick_test@statements.snap b/datafusion-cli/tests/snapshots/cli_quick_test@statements.snap index 3b975bb6a927d..a394458768d1b 100644 --- a/datafusion-cli/tests/snapshots/cli_quick_test@statements.snap +++ b/datafusion-cli/tests/snapshots/cli_quick_test@statements.snap @@ -1,5 +1,5 @@ --- -source: tests/cli_integration.rs +source: datafusion-cli/tests/cli_integration.rs info: program: datafusion-cli args: diff --git a/datafusion-cli/tests/snapshots/cli_top_memory_consumers@no_track.snap b/datafusion-cli/tests/snapshots/cli_top_memory_consumers@no_track.snap index 89b646a531f8b..c34e1202f55da 100644 --- a/datafusion-cli/tests/snapshots/cli_top_memory_consumers@no_track.snap +++ b/datafusion-cli/tests/snapshots/cli_top_memory_consumers@no_track.snap @@ -14,8 +14,8 @@ success: false exit_code: 1 ----- stdout ----- [CLI_VERSION] -Error: Not enough memory to continue external sort. 
Consider increasing the memory limit, or decreasing sort_spill_reservation_bytes +Error: Not enough memory to continue external sort. Consider increasing the memory limit config: 'datafusion.runtime.memory_limit', or decreasing the config: 'datafusion.execution.sort_spill_reservation_bytes'. caused by -Resources exhausted: Failed to allocate +Resources exhausted: Failed to allocate additional 128.0 KB for ExternalSorter[0] with 0.0 B already allocated for this reservation - 0.0 B remain available for the total memory pool: greedy(used: 10.0 MB, pool_size: 10.0 MB) ----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top2.snap b/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top2.snap index 62f864b3adb6e..ebf7a540d8d44 100644 --- a/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top2.snap +++ b/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top2.snap @@ -14,11 +14,11 @@ success: false exit_code: 1 ----- stdout ----- [CLI_VERSION] -Error: Not enough memory to continue external sort. Consider increasing the memory limit, or decreasing sort_spill_reservation_bytes +Error: Not enough memory to continue external sort. Consider increasing the memory limit config: 'datafusion.runtime.memory_limit', or decreasing the config: 'datafusion.execution.sort_spill_reservation_bytes'. caused by Resources exhausted: Additional allocation failed for ExternalSorter[0] with top memory consumers (across reservations) as: Consumer(can spill: bool) consumed XB, peak XB, Consumer(can spill: bool) consumed XB, peak XB. -Error: Failed to allocate +Error: Failed to allocate additional 128.0 KB for ExternalSorter[0] with 0.0 B already allocated for this reservation - 0.0 B remain available for the total memory pool: greedy(used: 10.0 MB, pool_size: 10.0 MB) ----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top3_default.snap b/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top3_default.snap index 9845d095c9180..9e279ca93ddcd 100644 --- a/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top3_default.snap +++ b/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top3_default.snap @@ -12,12 +12,12 @@ success: false exit_code: 1 ----- stdout ----- [CLI_VERSION] -Error: Not enough memory to continue external sort. Consider increasing the memory limit, or decreasing sort_spill_reservation_bytes +Error: Not enough memory to continue external sort. Consider increasing the memory limit config: 'datafusion.runtime.memory_limit', or decreasing the config: 'datafusion.execution.sort_spill_reservation_bytes'. caused by Resources exhausted: Additional allocation failed for ExternalSorter[0] with top memory consumers (across reservations) as: Consumer(can spill: bool) consumed XB, peak XB, Consumer(can spill: bool) consumed XB, peak XB, Consumer(can spill: bool) consumed XB, peak XB. 
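The pool description in the rewritten error text ("greedy(used: 10.0 MB, pool_size: 10.0 MB)" here, "fair(pool_size: 10.0 MB)" in the new `with_mem_pool_type` snapshots further down) comes from the memory pool the CLI was started with. A minimal sketch, not taken from this PR, of configuring the equivalent pools programmatically, assuming the current `RuntimeEnvBuilder` and memory pool APIs:

```rust
use std::sync::Arc;

use datafusion::error::Result;
use datafusion::execution::memory_pool::{FairSpillPool, GreedyMemoryPool};
use datafusion::execution::runtime_env::RuntimeEnvBuilder;
use datafusion::prelude::{SessionConfig, SessionContext};

#[tokio::main]
async fn main() -> Result<()> {
    // Equivalent of `--memory-limit 10M --mem-pool-type fair`; swap in
    // `GreedyMemoryPool::new(10 * 1024 * 1024)` for the greedy variant.
    let pool = Arc::new(FairSpillPool::new(10 * 1024 * 1024));
    let _greedy = GreedyMemoryPool::new(10 * 1024 * 1024); // alternative pool
    let runtime = RuntimeEnvBuilder::new().with_memory_pool(pool).build_arc()?;
    let ctx = SessionContext::new_with_config_rt(SessionConfig::new(), runtime);

    // A large sort against a 10 MB pool should fail with a
    // "Resources exhausted ... fair(pool_size: 10.0 MB)" error like the
    // ones captured in these snapshots.
    let df = ctx
        .sql("select * from generate_series(1,500000) as t1(v1) order by v1")
        .await?;
    let _ = df.collect().await; // expected to return an error, not panic
    Ok(())
}
```

The greedy pool hands out memory first come, first served, while the fair pool reserves an equal share for each spillable consumer, which is why the two snapshot families report different pool descriptions.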
-Error: Failed to allocate +Error: Failed to allocate additional 128.0 KB for ExternalSorter[0] with 0.0 B already allocated for this reservation - 0.0 B remain available for the total memory pool: greedy(used: 10.0 MB, pool_size: 10.0 MB) ----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@no_track.snap b/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@no_track.snap new file mode 100644 index 0000000000000..9a228fcfb6e93 --- /dev/null +++ b/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@no_track.snap @@ -0,0 +1,23 @@ +--- +source: datafusion-cli/tests/cli_integration.rs +info: + program: datafusion-cli + args: + - "--memory-limit" + - 10M + - "--mem-pool-type" + - fair + - "--command" + - "select * from generate_series(1,500000) as t1(v1) order by v1;" + - "--top-memory-consumers" + - "0" +--- +success: false +exit_code: 1 +----- stdout ----- +[CLI_VERSION] +Error: Not enough memory to continue external sort. Consider increasing the memory limit config: 'datafusion.runtime.memory_limit', or decreasing the config: 'datafusion.execution.sort_spill_reservation_bytes'. +caused by +Resources exhausted: Failed to allocate additional 128.0 KB for ExternalSorter[0] with 0.0 B already allocated for this reservation - 0.0 B remain available for the total memory pool: fair(pool_size: 10.0 MB) + +----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@top2.snap b/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@top2.snap new file mode 100644 index 0000000000000..d7f964a339313 --- /dev/null +++ b/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@top2.snap @@ -0,0 +1,26 @@ +--- +source: datafusion-cli/tests/cli_integration.rs +info: + program: datafusion-cli + args: + - "--memory-limit" + - 10M + - "--mem-pool-type" + - fair + - "--command" + - "select * from generate_series(1,500000) as t1(v1) order by v1;" + - "--top-memory-consumers" + - "2" +--- +success: false +exit_code: 1 +----- stdout ----- +[CLI_VERSION] +Error: Not enough memory to continue external sort. Consider increasing the memory limit config: 'datafusion.runtime.memory_limit', or decreasing the config: 'datafusion.execution.sort_spill_reservation_bytes'. +caused by +Resources exhausted: Additional allocation failed for ExternalSorter[0] with top memory consumers (across reservations) as: + Consumer(can spill: bool) consumed XB, peak XB, + Consumer(can spill: bool) consumed XB, peak XB. 
+Error: Failed to allocate additional 128.0 KB for ExternalSorter[0] with 0.0 B already allocated for this reservation - 0.0 B remain available for the total memory pool: fair(pool_size: 10.0 MB) + +----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_wide_result_set_no_crash@wide_result_set.snap b/datafusion-cli/tests/snapshots/cli_wide_result_set_no_crash@wide_result_set.snap new file mode 100644 index 0000000000000..30b34f3c12baa --- /dev/null +++ b/datafusion-cli/tests/snapshots/cli_wide_result_set_no_crash@wide_result_set.snap @@ -0,0 +1,32 @@ +--- +source: datafusion-cli/tests/cli_integration.rs +assertion_line: 307 +info: + program: datafusion-cli + args: + - "--maxrows" + - "5" + - "--command" + - "SELECT v1 as c0, v1+1 as c1, v1+2 as c2, v1+3 as c3, v1+4 as c4, v1+5 as c5, v1+6 as c6, v1+7 as c7, v1+8 as c8, v1+9 as c9 FROM generate_series(1, 100) as t1(v1);" +--- +success: true +exit_code: 0 +----- stdout ----- +[CLI_VERSION] ++----+----+----+----+----+----+----+----+----+----+ +| c0 | c1 | c2 | c3 | c4 | c5 | c6 | c7 | c8 | c9 | ++----+----+----+----+----+----+----+----+----+----+ +| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | +| 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | +| 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | +| 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | +| 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | +| . | +| . | +| . | ++----+----+----+----+----+----+----+----+----+----+ +100 row(s) fetched. (First 5 displayed. Use --maxrows to adjust) +[ELAPSED] + + +----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_with_unbounded_memory_pool@default.snap b/datafusion-cli/tests/snapshots/cli_with_unbounded_memory_pool@default.snap new file mode 100644 index 0000000000000..7bdcd63dc7be6 --- /dev/null +++ b/datafusion-cli/tests/snapshots/cli_with_unbounded_memory_pool@default.snap @@ -0,0 +1,36 @@ +--- +source: datafusion-cli/tests/cli_integration.rs +info: + program: datafusion-cli + args: + - "--maxrows" + - "10" + - "--command" + - "select * from generate_series(1,500000) as t1(v1) order by v1;" +--- +success: true +exit_code: 0 +----- stdout ----- +[CLI_VERSION] ++----+ +| v1 | ++----+ +| 1 | +| 2 | +| 3 | +| 4 | +| 5 | +| 6 | +| 7 | +| 8 | +| 9 | +| 10 | +| . | +| . | +| . | ++----+ +500000 row(s) fetched. (First 10 displayed. Use --maxrows to adjust) +[ELAPSED] + + +----- stderr ----- diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index bb0525e57753b..e56f5ad6b8ca7 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -29,63 +29,50 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +# Note: add additional linter rules in lib.rs. 
+# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true -[[example]] -name = "flight_sql_server" -path = "examples/flight/flight_sql_server.rs" - -[[example]] -name = "flight_server" -path = "examples/flight/flight_server.rs" - -[[example]] -name = "flight_client" -path = "examples/flight/flight_client.rs" - -[[example]] -name = "dataframe_to_s3" -path = "examples/external_dependency/dataframe-to-s3.rs" - -[[example]] -name = "query_aws_s3" -path = "examples/external_dependency/query-aws-s3.rs" - -[[example]] -name = "custom_file_casts" -path = "examples/custom_file_casts.rs" +[dependencies] +arrow = { workspace = true } +arrow-schema = { workspace = true } +datafusion = { workspace = true, default-features = true, features = ["parquet_encryption"] } +datafusion-common = { workspace = true } +nom = "8.0.0" +tempfile = { workspace = true } +tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot", "fs"] } [dev-dependencies] -arrow = { workspace = true } -# arrow_schema is required for record_batch! macro :sad: arrow-flight = { workspace = true } -arrow-schema = { workspace = true } async-trait = { workspace = true } bytes = { workspace = true } dashmap = { workspace = true } # note only use main datafusion crate for examples base64 = "0.22.1" -datafusion = { workspace = true, default-features = true, features = ["parquet_encryption"] } -datafusion-ffi = { workspace = true } +datafusion-expr = { workspace = true } datafusion-physical-expr-adapter = { workspace = true } datafusion-proto = { workspace = true } +datafusion-sql = { workspace = true } env_logger = { workspace = true } futures = { workspace = true } +insta = { workspace = true } log = { workspace = true } mimalloc = { version = "0.1", default-features = false } object_store = { workspace = true, features = ["aws", "http"] } prost = { workspace = true } rand = { workspace = true } +serde = { version = "1", features = ["derive"] } serde_json = { workspace = true } -tempfile = { workspace = true } +strum = { workspace = true } +strum_macros = { workspace = true } test-utils = { path = "../test-utils" } -tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } tonic = "0.14" tracing = { version = "0.1" } tracing-subscriber = { version = "0.3" } url = { workspace = true } -uuid = "1.18" +uuid = { workspace = true } [target.'cfg(not(target_os = "windows"))'.dev-dependencies] -nix = { version = "0.30.1", features = ["fs"] } +nix = { version = "0.31.1", features = ["fs"] } diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md index f1bcbcce82004..073f269d4a35d 100644 --- a/datafusion-examples/README.md +++ b/datafusion-examples/README.md @@ -39,59 +39,193 @@ git submodule update --init # Change to the examples directory cd datafusion-examples/examples -# Run the `dataframe` example: -# ... 
use the equivalent for other examples -cargo run --example dataframe +# Run all examples in a group +cargo run --example <group_name> -- all + +# Run a specific example within a group +cargo run --example <group_name> -- <example_name> + +# Run all examples in the `dataframe` group +cargo run --example dataframe -- all + +# Run a single example from the `dataframe` group +# (apply the same pattern for any other group) +cargo run --example dataframe -- dataframe ``` -## Single Process - -- [`advanced_udaf.rs`](examples/advanced_udaf.rs): Define and invoke a more complicated User Defined Aggregate Function (UDAF) -- [`advanced_udf.rs`](examples/advanced_udf.rs): Define and invoke a more complicated User Defined Scalar Function (UDF) -- [`advanced_udwf.rs`](examples/advanced_udwf.rs): Define and invoke a more complicated User Defined Window Function (UDWF) -- [`advanced_parquet_index.rs`](examples/advanced_parquet_index.rs): Creates a detailed secondary index that covers the contents of several parquet files -- [`async_udf.rs`](examples/async_udf.rs): Define and invoke an asynchronous User Defined Scalar Function (UDF) -- [`analyzer_rule.rs`](examples/analyzer_rule.rs): Use a custom AnalyzerRule to change a query's semantics (row level access control) -- [`catalog.rs`](examples/catalog.rs): Register the table into a custom catalog -- [`composed_extension_codec`](examples/composed_extension_codec.rs): Example of using multiple extension codecs for serialization / deserialization -- [`csv_sql_streaming.rs`](examples/csv_sql_streaming.rs): Build and run a streaming query plan from a SQL statement against a local CSV file -- [`csv_json_opener.rs`](examples/csv_json_opener.rs): Use low level `FileOpener` APIs to read CSV/JSON into Arrow `RecordBatch`es -- [`custom_datasource.rs`](examples/custom_datasource.rs): Run queries against a custom datasource (TableProvider) -- [`custom_file_casts.rs`](examples/custom_file_casts.rs): Implement custom casting rules to adapt file schemas -- [`custom_file_format.rs`](examples/custom_file_format.rs): Write data to a custom file format -- [`dataframe-to-s3.rs`](examples/external_dependency/dataframe-to-s3.rs): Run a query using a DataFrame against a parquet file from s3 and writing back to s3 -- [`dataframe.rs`](examples/dataframe.rs): Run a query using a DataFrame API against parquet files, csv files, and in-memory data, including multiple subqueries. Also demonstrates the various methods to write out a DataFrame to a table, parquet file, csv file, and json file. -- [`default_column_values.rs`](examples/default_column_values.rs): Implement custom default value handling for missing columns using field metadata and PhysicalExprAdapter -- [`deserialize_to_struct.rs`](examples/deserialize_to_struct.rs): Convert query results (Arrow ArrayRefs) into Rust structs -- [`expr_api.rs`](examples/expr_api.rs): Create, execute, simplify, analyze and coerce `Expr`s -- [`file_stream_provider.rs`](examples/file_stream_provider.rs): Run a query on `FileStreamProvider` which implements `StreamProvider` for reading and writing to arbitrary stream sources / sinks. 
-- [`flight_sql_server.rs`](examples/flight/flight_sql_server.rs): Run DataFusion as a standalone process and execute SQL queries from JDBC clients -- [`function_factory.rs`](examples/function_factory.rs): Register `CREATE FUNCTION` handler to implement SQL macros -- [`memory_pool_tracking.rs`](examples/memory_pool_tracking.rs): Demonstrates TrackConsumersPool for memory tracking and debugging with enhanced error messages -- [`memory_pool_execution_plan.rs`](examples/memory_pool_execution_plan.rs): Shows how to implement memory-aware ExecutionPlan with memory reservation and spilling -- [`optimizer_rule.rs`](examples/optimizer_rule.rs): Use a custom OptimizerRule to replace certain predicates -- [`parquet_embedded_index.rs`](examples/parquet_embedded_index.rs): Store a custom index inside a Parquet file and use it to speed up queries -- [`parquet_encrypted.rs`](examples/parquet_encrypted.rs): Read and write encrypted Parquet files using DataFusion -- [`parquet_encrypted_with_kms.rs`](examples/parquet_encrypted_with_kms.rs): Read and write encrypted Parquet files using an encryption factory -- [`parquet_index.rs`](examples/parquet_index.rs): Create an secondary index over several parquet files and use it to speed up queries -- [`parquet_exec_visitor.rs`](examples/parquet_exec_visitor.rs): Extract statistics by visiting an ExecutionPlan after execution -- [`parse_sql_expr.rs`](examples/parse_sql_expr.rs): Parse SQL text into DataFusion `Expr`. -- [`plan_to_sql.rs`](examples/plan_to_sql.rs): Generate SQL from DataFusion `Expr` and `LogicalPlan` -- [`planner_api.rs`](examples/planner_api.rs) APIs to manipulate logical and physical plans -- [`pruning.rs`](examples/pruning.rs): Use pruning to rule out files based on statistics -- [`query-aws-s3.rs`](examples/external_dependency/query-aws-s3.rs): Configure `object_store` and run a query against files stored in AWS S3 -- [`query-http-csv.rs`](examples/query-http-csv.rs): Configure `object_store` and run a query against files vi HTTP -- [`regexp.rs`](examples/regexp.rs): Examples of using regular expression functions -- [`remote_catalog.rs`](examples/regexp.rs): Examples of interfacing with a remote catalog (e.g. over a network) -- [`simple_udaf.rs`](examples/simple_udaf.rs): Define and invoke a User Defined Aggregate Function (UDAF) -- [`simple_udf.rs`](examples/simple_udf.rs): Define and invoke a User Defined Scalar Function (UDF) -- [`simple_udfw.rs`](examples/simple_udwf.rs): Define and invoke a User Defined Window Function (UDWF) -- [`sql_analysis.rs`](examples/sql_analysis.rs): Analyse SQL queries with DataFusion structures -- [`sql_frontend.rs`](examples/sql_frontend.rs): Create LogicalPlans (only) from sql strings -- [`sql_dialect.rs`](examples/sql_dialect.rs): Example of implementing a custom SQL dialect on top of `DFParser` -- [`sql_query.rs`](examples/memtable.rs): Query data using SQL (in memory `RecordBatches`, local Parquet files) -- [`date_time_function.rs`](examples/date_time_function.rs): Examples of date-time related functions and queries. - -## Distributed - -- [`flight_client.rs`](examples/flight/flight_client.rs) and [`flight_server.rs`](examples/flight/flight_server.rs): Run DataFusion as a standalone process and execute SQL queries from a client using the Flight protocol. 
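One note on the `[lints]` comment in the `datafusion-examples/Cargo.toml` hunk above: because Cargo cannot yet combine `workspace = true` with extra per-crate lint rules (rust-lang/cargo#13157), any additional lints have to live in the crate root as attributes. A hedged sketch of that workaround; the specific lints below are illustrative, not the ones this crate actually enables:

```rust
// Crate root (lib.rs or an example's main.rs): crate-level lint attributes
// layered on top of the lints inherited via `[lints] workspace = true`.
#![deny(clippy::clone_on_ref_ptr)] // illustrative extra lint
#![warn(missing_docs)] // illustrative extra lint

//! Crate documentation goes here.

/// Placeholder item so the sketch compiles under `missing_docs`.
pub fn example() {}
```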
+## Builtin Functions Examples + +### Group: `builtin_functions` + +#### Category: Single Process + +| Subcommand | File Path | Description | +| ---------------- | ----------------------------------------------------------------------------------------- | ---------------------------------------------------------- | +| date_time | [`builtin_functions/date_time.rs`](examples/builtin_functions/date_time.rs) | Examples of date-time related functions and queries | +| function_factory | [`builtin_functions/function_factory.rs`](examples/builtin_functions/function_factory.rs) | Register `CREATE FUNCTION` handler to implement SQL macros | +| regexp | [`builtin_functions/regexp.rs`](examples/builtin_functions/regexp.rs) | Examples of using regular expression functions | + +## Custom Data Source Examples + +### Group: `custom_data_source` + +#### Category: Single Process + +| Subcommand | File Path | Description | +| --------------------- | ----------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- | +| adapter_serialization | [`custom_data_source/adapter_serialization.rs`](examples/custom_data_source/adapter_serialization.rs) | Preserve custom PhysicalExprAdapter information during plan serialization using PhysicalExtensionCodec interception | +| csv_json_opener | [`custom_data_source/csv_json_opener.rs`](examples/custom_data_source/csv_json_opener.rs) | Use low-level FileOpener APIs for CSV/JSON | +| csv_sql_streaming | [`custom_data_source/csv_sql_streaming.rs`](examples/custom_data_source/csv_sql_streaming.rs) | Run a streaming SQL query against CSV data | +| custom_datasource | [`custom_data_source/custom_datasource.rs`](examples/custom_data_source/custom_datasource.rs) | Query a custom TableProvider | +| custom_file_casts | [`custom_data_source/custom_file_casts.rs`](examples/custom_data_source/custom_file_casts.rs) | Implement custom casting rules | +| custom_file_format | [`custom_data_source/custom_file_format.rs`](examples/custom_data_source/custom_file_format.rs) | Write to a custom file format | +| default_column_values | [`custom_data_source/default_column_values.rs`](examples/custom_data_source/default_column_values.rs) | Custom default values using metadata | +| file_stream_provider | [`custom_data_source/file_stream_provider.rs`](examples/custom_data_source/file_stream_provider.rs) | Read/write via FileStreamProvider for streams | + +## Data IO Examples + +### Group: `data_io` + +#### Category: Single Process + +| Subcommand | File Path | Description | +| ---------------------- | ----------------------------------------------------------------------------------------- | ------------------------------------------------------------------------- | +| catalog | [`data_io/catalog.rs`](examples/data_io/catalog.rs) | Register tables into a custom catalog | +| in_memory_object_store | [`data_io/in_memory_object_store.rs`](examples/data_io/in_memory_object_store.rs) | Read CSV from an in-memory object store (pattern applies to JSON/Parquet) | +| json_shredding | [`data_io/json_shredding.rs`](examples/data_io/json_shredding.rs) | Implement filter rewriting for JSON shredding | +| parquet_adv_idx | [`data_io/parquet_advanced_index.rs`](examples/data_io/parquet_advanced_index.rs) | Create a secondary index across multiple parquet files | +| parquet_emb_idx | 
[`data_io/parquet_embedded_index.rs`](examples/data_io/parquet_embedded_index.rs) | Store a custom index inside Parquet files | +| parquet_enc | [`data_io/parquet_encrypted.rs`](examples/data_io/parquet_encrypted.rs) | Read & write encrypted Parquet files | +| parquet_enc_with_kms | [`data_io/parquet_encrypted_with_kms.rs`](examples/data_io/parquet_encrypted_with_kms.rs) | Encrypted Parquet I/O using a KMS-backed factory | +| parquet_exec_visitor | [`data_io/parquet_exec_visitor.rs`](examples/data_io/parquet_exec_visitor.rs) | Extract statistics by visiting an ExecutionPlan | +| parquet_idx | [`data_io/parquet_index.rs`](examples/data_io/parquet_index.rs) | Create a secondary index | +| query_http_csv | [`data_io/query_http_csv.rs`](examples/data_io/query_http_csv.rs) | Query CSV files via HTTP | +| remote_catalog | [`data_io/remote_catalog.rs`](examples/data_io/remote_catalog.rs) | Interact with a remote catalog | + +## DataFrame Examples + +### Group: `dataframe` + +#### Category: Single Process + +| Subcommand | File Path | Description | +| --------------------- | ----------------------------------------------------------------------------------- | ------------------------------------------------------- | +| cache_factory | [`dataframe/cache_factory.rs`](examples/dataframe/cache_factory.rs) | Custom lazy caching for DataFrames using `CacheFactory` | +| dataframe | [`dataframe/dataframe.rs`](examples/dataframe/dataframe.rs) | Query DataFrames from various sources and write output | +| deserialize_to_struct | [`dataframe/deserialize_to_struct.rs`](examples/dataframe/deserialize_to_struct.rs) | Convert Arrow arrays into Rust structs | + +## Execution Monitoring Examples + +### Group: `execution_monitoring` + +#### Category: Single Process + +| Subcommand | File Path | Description | +| ------------------ | ------------------------------------------------------------------------------------------------------------------- | ---------------------------------------- | +| mem_pool_exec_plan | [`execution_monitoring/memory_pool_execution_plan.rs`](examples/execution_monitoring/memory_pool_execution_plan.rs) | Memory-aware ExecutionPlan with spilling | +| mem_pool_tracking | [`execution_monitoring/memory_pool_tracking.rs`](examples/execution_monitoring/memory_pool_tracking.rs) | Demonstrates memory tracking | +| tracing | [`execution_monitoring/tracing.rs`](examples/execution_monitoring/tracing.rs) | Demonstrates tracing integration | + +## Extension Types Examples + +### Group: `extension_types` + +#### Category: Single Process + +| Subcommand | File Path | Description | +| ----------- | --------------------------------------------------------------------------- | ------------------------------------ | +| temperature | [`extension_types/temperature.rs`](examples/extension_types/temperature.rs) | Extension type for temperature data. 
| + +## External Dependency Examples + +### Group: `external_dependency` + +#### Category: Single Process + +| Subcommand | File Path | Description | +| --------------- | ------------------------------------------------------------------------------------------- | ---------------------------------------- | +| dataframe_to_s3 | [`external_dependency/dataframe_to_s3.rs`](examples/external_dependency/dataframe_to_s3.rs) | Query DataFrames and write results to S3 | +| query_aws_s3 | [`external_dependency/query_aws_s3.rs`](examples/external_dependency/query_aws_s3.rs) | Query S3-backed data using object_store | + +## Flight Examples + +### Group: `flight` + +#### Category: Distributed + +| Subcommand | File Path | Description | +| ---------- | ------------------------------------------------------- | ------------------------------------------------------ | +| client | [`flight/client.rs`](examples/flight/client.rs) | Execute SQL queries via Arrow Flight protocol | +| server | [`flight/server.rs`](examples/flight/server.rs) | Run DataFusion server accepting FlightSQL/JDBC queries | +| sql_server | [`flight/sql_server.rs`](examples/flight/sql_server.rs) | Standalone SQL server for JDBC clients | + +## Proto Examples + +### Group: `proto` + +#### Category: Single Process + +| Subcommand | File Path | Description | +| ------------------------ | --------------------------------------------------------------------------------- | ----------------------------------------------------------------------------- | +| composed_extension_codec | [`proto/composed_extension_codec.rs`](examples/proto/composed_extension_codec.rs) | Use multiple extension codecs for serialization/deserialization | +| expression_deduplication | [`proto/expression_deduplication.rs`](examples/proto/expression_deduplication.rs) | Example of expression caching/deduplication using the codec decorator pattern | + +## Query Planning Examples + +### Group: `query_planning` + +#### Category: Single Process + +| Subcommand | File Path | Description | +| -------------- | ------------------------------------------------------------------------------- | ------------------------------------------------------ | +| analyzer_rule | [`query_planning/analyzer_rule.rs`](examples/query_planning/analyzer_rule.rs) | Custom AnalyzerRule to change query semantics | +| expr_api | [`query_planning/expr_api.rs`](examples/query_planning/expr_api.rs) | Create, execute, analyze, and coerce Exprs | +| optimizer_rule | [`query_planning/optimizer_rule.rs`](examples/query_planning/optimizer_rule.rs) | Replace predicates via a custom OptimizerRule | +| parse_sql_expr | [`query_planning/parse_sql_expr.rs`](examples/query_planning/parse_sql_expr.rs) | Parse SQL into DataFusion Expr | +| plan_to_sql | [`query_planning/plan_to_sql.rs`](examples/query_planning/plan_to_sql.rs) | Generate SQL from expressions or plans | +| planner_api | [`query_planning/planner_api.rs`](examples/query_planning/planner_api.rs) | APIs for logical and physical plan manipulation | +| pruning | [`query_planning/pruning.rs`](examples/query_planning/pruning.rs) | Use pruning to skip irrelevant files | +| thread_pools | [`query_planning/thread_pools.rs`](examples/query_planning/thread_pools.rs) | Configure custom thread pools for DataFusion execution | + +## Relation Planner Examples + +### Group: `relation_planner` + +#### Category: Single Process + +| Subcommand | File Path | Description | +| --------------- | ------------------------------------------------------------------------------------- | 
------------------------------------------ | +| match_recognize | [`relation_planner/match_recognize.rs`](examples/relation_planner/match_recognize.rs) | Implement MATCH_RECOGNIZE pattern matching | +| pivot_unpivot | [`relation_planner/pivot_unpivot.rs`](examples/relation_planner/pivot_unpivot.rs) | Implement PIVOT / UNPIVOT | +| table_sample | [`relation_planner/table_sample.rs`](examples/relation_planner/table_sample.rs) | Implement TABLESAMPLE | + +## SQL Ops Examples + +### Group: `sql_ops` + +#### Category: Single Process + +| Subcommand | File Path | Description | +| ----------------- | ----------------------------------------------------------------------- | -------------------------------------------------- | +| analysis | [`sql_ops/analysis.rs`](examples/sql_ops/analysis.rs) | Analyze SQL queries | +| custom_sql_parser | [`sql_ops/custom_sql_parser.rs`](examples/sql_ops/custom_sql_parser.rs) | Implement a custom SQL parser to extend DataFusion | +| frontend | [`sql_ops/frontend.rs`](examples/sql_ops/frontend.rs) | Build LogicalPlans from SQL | +| query | [`sql_ops/query.rs`](examples/sql_ops/query.rs) | Query data using SQL | + +## UDF Examples + +### Group: `udf` + +#### Category: Single Process + +| Subcommand | File Path | Description | +| --------------- | ----------------------------------------------------------- | ----------------------------------------------- | +| adv_udaf | [`udf/advanced_udaf.rs`](examples/udf/advanced_udaf.rs) | Advanced User Defined Aggregate Function (UDAF) | +| adv_udf | [`udf/advanced_udf.rs`](examples/udf/advanced_udf.rs) | Advanced User Defined Scalar Function (UDF) | +| adv_udwf | [`udf/advanced_udwf.rs`](examples/udf/advanced_udwf.rs) | Advanced User Defined Window Function (UDWF) | +| async_udf | [`udf/async_udf.rs`](examples/udf/async_udf.rs) | Asynchronous User Defined Scalar Function | +| udaf | [`udf/simple_udaf.rs`](examples/udf/simple_udaf.rs) | Simple UDAF example | +| udf | [`udf/simple_udf.rs`](examples/udf/simple_udf.rs) | Simple UDF example | +| udtf | [`udf/simple_udtf.rs`](examples/udf/simple_udtf.rs) | Simple UDTF example | +| udwf | [`udf/simple_udwf.rs`](examples/udf/simple_udwf.rs) | Simple UDWF example | +| table_list_udtf | [`udf/table_list_udtf.rs`](examples/udf/table_list_udtf.rs) | Session-aware UDTF table list example | diff --git a/datafusion-examples/data/README.md b/datafusion-examples/data/README.md new file mode 100644 index 0000000000000..e8296a8856e60 --- /dev/null +++ b/datafusion-examples/data/README.md @@ -0,0 +1,25 @@ + + +## Example datasets + +| Filename | Path | Description | +| ----------- | --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `cars.csv` | [`data/csv/cars.csv`](./csv/cars.csv) | Time-series–like dataset containing car identifiers, speed values, and timestamps. Used in window function and time-based query examples (e.g. ordering, window frames). | +| `regex.csv` | [`data/csv/regex.csv`](./csv/regex.csv) | Dataset for regular expression examples. Contains input values, regex patterns, replacement strings, and optional flags. Covers ASCII, Unicode, and locale-specific text processing. 
| diff --git a/datafusion-examples/data/csv/cars.csv b/datafusion-examples/data/csv/cars.csv new file mode 100644 index 0000000000000..bc40f3b01e7a5 --- /dev/null +++ b/datafusion-examples/data/csv/cars.csv @@ -0,0 +1,26 @@ +car,speed,time +red,20.0,1996-04-12T12:05:03.000000000 +red,20.3,1996-04-12T12:05:04.000000000 +red,21.4,1996-04-12T12:05:05.000000000 +red,21.5,1996-04-12T12:05:06.000000000 +red,19.0,1996-04-12T12:05:07.000000000 +red,18.0,1996-04-12T12:05:08.000000000 +red,17.0,1996-04-12T12:05:09.000000000 +red,7.0,1996-04-12T12:05:10.000000000 +red,7.1,1996-04-12T12:05:11.000000000 +red,7.2,1996-04-12T12:05:12.000000000 +red,3.0,1996-04-12T12:05:13.000000000 +red,1.0,1996-04-12T12:05:14.000000000 +red,0.0,1996-04-12T12:05:15.000000000 +green,10.0,1996-04-12T12:05:03.000000000 +green,10.3,1996-04-12T12:05:04.000000000 +green,10.4,1996-04-12T12:05:05.000000000 +green,10.5,1996-04-12T12:05:06.000000000 +green,11.0,1996-04-12T12:05:07.000000000 +green,12.0,1996-04-12T12:05:08.000000000 +green,14.0,1996-04-12T12:05:09.000000000 +green,15.0,1996-04-12T12:05:10.000000000 +green,15.1,1996-04-12T12:05:11.000000000 +green,15.2,1996-04-12T12:05:12.000000000 +green,8.0,1996-04-12T12:05:13.000000000 +green,2.0,1996-04-12T12:05:14.000000000 diff --git a/datafusion-examples/data/csv/regex.csv b/datafusion-examples/data/csv/regex.csv new file mode 100644 index 0000000000000..b249c39522b60 --- /dev/null +++ b/datafusion-examples/data/csv/regex.csv @@ -0,0 +1,12 @@ +values,patterns,replacement,flags +abc,^(a),bb\1bb,i +ABC,^(A).*,B,i +aBc,(b|d),e,i +AbC,(B|D),e, +aBC,^(b|c),d, +4000,\b4([1-9]\d\d|\d[1-9]\d|\d\d[1-9])\b,xyz, +4010,\b4([1-9]\d\d|\d[1-9]\d|\d\d[1-9])\b,xyz, +Düsseldorf,[\p{Letter}-]+,München, +Москва,[\p{L}-]+,Moscow, +Köln,[a-zA-Z]ö[a-zA-Z]{2},Koln, +اليوم,^\p{Arabic}+$,Today, \ No newline at end of file diff --git a/datafusion-examples/examples/date_time_functions.rs b/datafusion-examples/examples/builtin_functions/date_time.rs similarity index 96% rename from datafusion-examples/examples/date_time_functions.rs rename to datafusion-examples/examples/builtin_functions/date_time.rs index 2628319ae31f0..08d4bc6e29978 100644 --- a/datafusion-examples/examples/date_time_functions.rs +++ b/datafusion-examples/examples/builtin_functions/date_time.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. + use std::sync::Arc; use arrow::array::{Date32Array, Int32Array}; @@ -26,8 +28,20 @@ use datafusion::common::assert_contains; use datafusion::error::Result; use datafusion::prelude::*; -#[tokio::main] -async fn main() -> Result<()> { +/// Example: Working with Date and Time Functions +/// +/// This example demonstrates how to work with various date and time +/// functions in DataFusion using both the DataFrame API and SQL queries. +/// +/// It includes: +/// - `make_date`: building `DATE` values from year, month, and day columns +/// - `to_date`: converting string expressions into `DATE` values +/// - `to_timestamp`: parsing strings or numeric values into `TIMESTAMP`s +/// - `to_char`: formatting dates, timestamps, and durations as strings +/// +/// Together, these examples show how to create, convert, and format temporal +/// data using DataFusion’s built-in functions. 
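The doc comment above lists `make_date`, `to_date`, `to_timestamp`, and `to_char`. A compact, self-contained sketch (not part of the example file) exercising all four in one query; the literal values are illustrative:

```rust
use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    // Build, parse, and format temporal values with the built-in functions.
    ctx.sql(
        "SELECT make_date(2024, 1, 15)                      AS built, \
                to_date('2024-01-15')                       AS parsed_date, \
                to_timestamp('2024-01-15T12:30:00')         AS parsed_ts, \
                to_char(make_date(2024, 1, 15), '%Y/%m/%d') AS formatted",
    )
    .await?
    .show()
    .await?;
    Ok(())
}
```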
+pub async fn date_time() -> Result<()> { query_make_date().await?; query_to_date().await?; query_to_timestamp().await?; @@ -167,12 +181,13 @@ async fn query_make_date() -> Result<()> { // invalid column values will result in an error let result = ctx - .sql("select make_date(2024, null, 23)") + .sql("select make_date(2024, '', 23)") .await? .collect() .await; - let expected = "Execution error: Unable to parse date from null/empty value"; + let expected = + "Arrow error: Cast error: Cannot cast string '' to value of Int32 type"; assert_contains!(result.unwrap_err().to_string(), expected); // invalid date values will also result in an error @@ -182,7 +197,7 @@ async fn query_make_date() -> Result<()> { .collect() .await; - let expected = "Execution error: Unable to parse date from 2024, 1, 32"; + let expected = "Execution error: Day value '32' is out of range"; assert_contains!(result.unwrap_err().to_string(), expected); Ok(()) diff --git a/datafusion-examples/examples/function_factory.rs b/datafusion-examples/examples/builtin_functions/function_factory.rs similarity index 96% rename from datafusion-examples/examples/function_factory.rs rename to datafusion-examples/examples/builtin_functions/function_factory.rs index d4312ae594091..3cc77371d44ce 100644 --- a/datafusion-examples/examples/function_factory.rs +++ b/datafusion-examples/examples/builtin_functions/function_factory.rs @@ -15,14 +15,16 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. + use arrow::datatypes::DataType; use datafusion::common::tree_node::{Transformed, TreeNode}; -use datafusion::common::{exec_datafusion_err, exec_err, internal_err, DataFusionError}; +use datafusion::common::{DataFusionError, exec_datafusion_err, exec_err, internal_err}; use datafusion::error::Result; use datafusion::execution::context::{ FunctionFactory, RegisterFunction, SessionContext, SessionState, }; -use datafusion::logical_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; +use datafusion::logical_expr::simplify::{ExprSimplifyResult, SimplifyContext}; use datafusion::logical_expr::sort_properties::{ExprProperties, SortProperties}; use datafusion::logical_expr::{ ColumnarValue, CreateFunction, Expr, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, @@ -42,8 +44,7 @@ use std::sync::Arc; /// /// This example is rather simple and does not cover all cases required for a /// real implementation. -#[tokio::main] -async fn main() -> Result<()> { +pub async fn function_factory() -> Result<()> { // First we must configure the SessionContext with our function factory let ctx = SessionContext::new() // register custom function factory @@ -117,10 +118,6 @@ struct ScalarFunctionWrapper { } impl ScalarUDFImpl for ScalarFunctionWrapper { - fn as_any(&self) -> &dyn std::any::Any { - self - } - fn name(&self) -> &str { &self.name } @@ -144,7 +141,7 @@ impl ScalarUDFImpl for ScalarFunctionWrapper { fn simplify( &self, args: Vec, - _info: &dyn SimplifyInfo, + _info: &SimplifyContext, ) -> Result { let replacement = Self::replacement(&self.expr, &args)?; diff --git a/datafusion-examples/examples/builtin_functions/main.rs b/datafusion-examples/examples/builtin_functions/main.rs new file mode 100644 index 0000000000000..42ca15f91935d --- /dev/null +++ b/datafusion-examples/examples/builtin_functions/main.rs @@ -0,0 +1,94 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! # These are miscellaneous function-related examples +//! +//! These examples demonstrate miscellaneous function-related features. +//! +//! ## Usage +//! ```bash +//! cargo run --example builtin_functions -- [all|date_time|function_factory|regexp] +//! ``` +//! +//! Each subcommand runs a corresponding example: +//! - `all` — run all examples included in this module +//! +//! - `date_time` +//! (file: date_time.rs, desc: Examples of date-time related functions and queries) +//! +//! - `function_factory` +//! (file: function_factory.rs, desc: Register `CREATE FUNCTION` handler to implement SQL macros) +//! +//! - `regexp` +//! (file: regexp.rs, desc: Examples of using regular expression functions) + +mod date_time; +mod function_factory; +mod regexp; + +use datafusion::error::{DataFusionError, Result}; +use strum::{IntoEnumIterator, VariantNames}; +use strum_macros::{Display, EnumIter, EnumString, VariantNames}; + +#[derive(EnumIter, EnumString, Display, VariantNames)] +#[strum(serialize_all = "snake_case")] +enum ExampleKind { + All, + DateTime, + FunctionFactory, + Regexp, +} + +impl ExampleKind { + const EXAMPLE_NAME: &str = "builtin_functions"; + + fn runnable() -> impl Iterator { + ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All)) + } + + async fn run(&self) -> Result<()> { + match self { + ExampleKind::All => { + for example in ExampleKind::runnable() { + println!("Running example: {example}"); + Box::pin(example.run()).await?; + } + } + ExampleKind::DateTime => date_time::date_time().await?, + ExampleKind::FunctionFactory => function_factory::function_factory().await?, + ExampleKind::Regexp => regexp::regexp().await?, + } + Ok(()) + } +} + +#[tokio::main] +async fn main() -> Result<()> { + let usage = format!( + "Usage: cargo run --example {} -- [{}]", + ExampleKind::EXAMPLE_NAME, + ExampleKind::VARIANTS.join("|") + ); + + let example: ExampleKind = std::env::args() + .nth(1) + .unwrap_or_else(|| ExampleKind::All.to_string()) + .parse() + .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?; + + example.run().await +} diff --git a/datafusion-examples/examples/regexp.rs b/datafusion-examples/examples/builtin_functions/regexp.rs similarity index 74% rename from datafusion-examples/examples/regexp.rs rename to datafusion-examples/examples/builtin_functions/regexp.rs index 12d115b9b502c..97dc71b94e934 100644 --- a/datafusion-examples/examples/regexp.rs +++ b/datafusion-examples/examples/builtin_functions/regexp.rs @@ -1,5 +1,4 @@ // Licensed to the Apache Software Foundation (ASF) under one -// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. 
The ASF licenses this file @@ -16,9 +15,12 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. + use datafusion::common::{assert_batches_eq, assert_contains}; use datafusion::error::Result; use datafusion::prelude::*; +use datafusion_examples::utils::datasets::ExampleDataset; /// This example demonstrates how to use the regexp_* functions /// @@ -28,15 +30,12 @@ use datafusion::prelude::*; /// /// Supported flags can be found at /// https://docs.rs/regex/latest/regex/#grouping-and-flags -#[tokio::main] -async fn main() -> Result<()> { +pub async fn regexp() -> Result<()> { let ctx = SessionContext::new(); - ctx.register_csv( - "examples", - "../../datafusion/physical-expr/tests/data/regex.csv", - CsvReadOptions::new(), - ) - .await?; + let dataset = ExampleDataset::Regex; + + ctx.register_csv("examples", dataset.path_str()?, CsvReadOptions::new()) + .await?; // // @@ -112,11 +111,11 @@ async fn main() -> Result<()> { assert_batches_eq!( &[ - "+---------------------------------------------------+----------------------------------------------------+", - "| regexp_like(Utf8(\"John Smith\"),Utf8(\"^.*Smith$\")) | regexp_like(Utf8(\"Smith Jones\"),Utf8(\"^Smith.*$\")) |", - "+---------------------------------------------------+----------------------------------------------------+", - "| true | true |", - "+---------------------------------------------------+----------------------------------------------------+", + "+---------------------------------------------------+----------------------------------------------------+", + "| regexp_like(Utf8(\"John Smith\"),Utf8(\"^.*Smith$\")) | regexp_like(Utf8(\"Smith Jones\"),Utf8(\"^Smith.*$\")) |", + "+---------------------------------------------------+----------------------------------------------------+", + "| true | true |", + "+---------------------------------------------------+----------------------------------------------------+", ], &result ); @@ -242,11 +241,11 @@ async fn main() -> Result<()> { assert_batches_eq!( &[ - "+----------------------------------------------------+-----------------------------------------------------+", - "| regexp_match(Utf8(\"John Smith\"),Utf8(\"^.*Smith$\")) | regexp_match(Utf8(\"Smith Jones\"),Utf8(\"^Smith.*$\")) |", - "+----------------------------------------------------+-----------------------------------------------------+", - "| [John Smith] | [Smith Jones] |", - "+----------------------------------------------------+-----------------------------------------------------+", + "+----------------------------------------------------+-----------------------------------------------------+", + "| regexp_match(Utf8(\"John Smith\"),Utf8(\"^.*Smith$\")) | regexp_match(Utf8(\"Smith Jones\"),Utf8(\"^Smith.*$\")) |", + "+----------------------------------------------------+-----------------------------------------------------+", + "| [John Smith] | [Smith Jones] |", + "+----------------------------------------------------+-----------------------------------------------------+", ], &result ); @@ -268,21 +267,21 @@ async fn main() -> Result<()> { assert_batches_eq!( &[ - "+---------------------------------------------------------------------------------------------------------+", - "| regexp_replace(examples.values,examples.patterns,examples.replacement,concat(Utf8(\"g\"),examples.flags)) |", - "+---------------------------------------------------------------------------------------------------------+", - "| bbabbbc |", - "| B |", - "| aec |", - "| AbC 
|", - "| aBC |", - "| 4000 |", - "| xyz |", - "| München |", - "| Moscow |", - "| Koln |", - "| Today |", - "+---------------------------------------------------------------------------------------------------------+", + "+---------------------------------------------------------------------------------------------------------+", + "| regexp_replace(examples.values,examples.patterns,examples.replacement,concat(Utf8(\"g\"),examples.flags)) |", + "+---------------------------------------------------------------------------------------------------------+", + "| bbabbbc |", + "| B |", + "| aec |", + "| AbC |", + "| aBC |", + "| 4000 |", + "| xyz |", + "| München |", + "| Moscow |", + "| Koln |", + "| Today |", + "+---------------------------------------------------------------------------------------------------------+", ], &result ); @@ -296,11 +295,11 @@ async fn main() -> Result<()> { assert_batches_eq!( &[ - "+------------------------------------------------------------------------+", - "| regexp_replace(Utf8(\"foobarbaz\"),Utf8(\"b(..)\"),Utf8(\"X\\1Y\"),Utf8(\"g\")) |", - "+------------------------------------------------------------------------+", - "| fooXarYXazY |", - "+------------------------------------------------------------------------+", + "+------------------------------------------------------------------------+", + "| regexp_replace(Utf8(\"foobarbaz\"),Utf8(\"b(..)\"),Utf8(\"X\\1Y\"),Utf8(\"g\")) |", + "+------------------------------------------------------------------------+", + "| fooXarYXazY |", + "+------------------------------------------------------------------------+", ], &result ); diff --git a/datafusion-examples/examples/custom_data_source/adapter_serialization.rs b/datafusion-examples/examples/custom_data_source/adapter_serialization.rs new file mode 100644 index 0000000000000..d82bd2097ce1d --- /dev/null +++ b/datafusion-examples/examples/custom_data_source/adapter_serialization.rs @@ -0,0 +1,513 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! See `main.rs` for how to run it. +//! +//! This example demonstrates how to use the `PhysicalProtoConverterExtension` +//! trait's interception methods (`execution_plan_to_proto` and +//! `proto_to_execution_plan`) to implement custom serialization logic. +//! +//! The key insight is that `FileScanConfig::expr_adapter_factory` is NOT serialized by +//! default. This example shows how to: +//! 1. Detect plans with custom adapters during serialization +//! 2. Wrap them as Extension nodes with JSON-serialized adapter metadata +//! 3. Store the inner DataSourceExec (without adapter) as a child in the extension's inputs field +//! 4. Unwrap and restore the adapter during deserialization +//! +//! 
+//! This demonstrates nested serialization (protobuf outer, JSON inner) and the
+//! power of `PhysicalProtoConverterExtension`. Both plan and expression
+//! serialization route through converter hooks, enabling interception at every
+//! node in the tree.
+
+use std::fmt::Debug;
+use std::sync::Arc;
+
+use arrow::array::record_batch;
+use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use datafusion::assert_batches_eq;
+use datafusion::common::{Result, not_impl_err};
+use datafusion::datasource::listing::{
+    ListingTable, ListingTableConfig, ListingTableConfigExt, ListingTableUrl,
+};
+use datafusion::datasource::physical_plan::{FileScanConfig, FileScanConfigBuilder};
+use datafusion::datasource::source::DataSourceExec;
+use datafusion::execution::TaskContext;
+use datafusion::execution::context::SessionContext;
+use datafusion::execution::object_store::ObjectStoreUrl;
+use datafusion::parquet::arrow::ArrowWriter;
+use datafusion::physical_expr::PhysicalExpr;
+use datafusion::physical_plan::ExecutionPlan;
+use datafusion::prelude::SessionConfig;
+use datafusion_physical_expr_adapter::{
+    DefaultPhysicalExprAdapterFactory, PhysicalExprAdapter, PhysicalExprAdapterFactory,
+};
+use datafusion_proto::bytes::{
+    physical_plan_from_bytes_with_proto_converter,
+    physical_plan_to_bytes_with_proto_converter,
+};
+use datafusion_proto::physical_plan::from_proto::parse_physical_expr_with_converter;
+use datafusion_proto::physical_plan::to_proto::serialize_physical_expr_with_converter;
+use datafusion_proto::physical_plan::{
+    PhysicalExtensionCodec, PhysicalPlanDecodeContext, PhysicalProtoConverterExtension,
+};
+use datafusion_proto::protobuf::physical_plan_node::PhysicalPlanType;
+use datafusion_proto::protobuf::{
+    PhysicalExprNode, PhysicalExtensionNode, PhysicalPlanNode,
+};
+use object_store::memory::InMemory;
+use object_store::path::Path;
+use object_store::{ObjectStore, ObjectStoreExt, PutPayload};
+use serde::{Deserialize, Serialize};
+
+/// Example showing how to preserve custom adapter information during plan serialization.
+///
+/// This demonstrates:
+/// 1. Creating a custom PhysicalExprAdapter with metadata
+/// 2. Using PhysicalExtensionCodec to intercept serialization
+/// 3. Wrapping adapter info as Extension nodes
+/// 4. Restoring adapters during deserialization
+pub async fn adapter_serialization() -> Result<()> {
+    println!("=== PhysicalExprAdapter Serialization Example ===\n");
+
+    // Step 1: Create sample Parquet data in memory
+    println!("Step 1: Creating sample Parquet data...");
+    let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+    let batch = record_batch!(("id", Int32, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))?;
+    let path = Path::from("data.parquet");
+    write_parquet(&store, &path, &batch).await?;
+
+    // Step 2: Set up session with custom adapter
+    println!("Step 2: Setting up session with custom adapter...");
+    let logical_schema =
+        Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
+
+    let mut cfg = SessionConfig::new();
+    cfg.options_mut().execution.parquet.pushdown_filters = true;
+    let ctx = SessionContext::new_with_config(cfg);
+    ctx.runtime_env().register_object_store(
+        ObjectStoreUrl::parse("memory://")?.as_ref(),
+        Arc::clone(&store),
+    );
+
+    // Create a table with our custom MetadataAdapterFactory
+    let adapter_factory = Arc::new(MetadataAdapterFactory::new("v1"));
+    let listing_config =
+        ListingTableConfig::new(ListingTableUrl::parse("memory:///data.parquet")?)
+            .infer_options(&ctx.state())
+            .await?
+            .with_schema(logical_schema)
+            .with_expr_adapter_factory(
+                Arc::clone(&adapter_factory) as Arc<dyn PhysicalExprAdapterFactory>
+            );
+    let table = ListingTable::try_new(listing_config)?;
+    ctx.register_table("my_table", Arc::new(table))?;
+
+    // Step 3: Create physical plan with filter
+    println!("Step 3: Creating physical plan with filter...");
+    let df = ctx.sql("SELECT * FROM my_table WHERE id > 5").await?;
+    let original_plan = df.create_physical_plan().await?;
+
+    // Verify adapter is present in original plan
+    let has_adapter_before = verify_adapter_in_plan(&original_plan, "original");
+    println!("  Original plan has adapter: {has_adapter_before}");
+
+    // Step 4: Serialize with our custom codec
+    println!("\nStep 4: Serializing plan with AdapterPreservingCodec...");
+    let codec = AdapterPreservingCodec;
+    let bytes = physical_plan_to_bytes_with_proto_converter(
+        Arc::clone(&original_plan),
+        &codec,
+        &codec,
+    )?;
+    println!("  Serialized {} bytes", bytes.len());
+    println!("  (DataSourceExec with adapter was wrapped as PhysicalExtensionNode)");
+
+    // Step 5: Deserialize with our custom codec
+    println!("\nStep 5: Deserializing plan with AdapterPreservingCodec...");
+    let task_ctx = ctx.task_ctx();
+    let restored_plan =
+        physical_plan_from_bytes_with_proto_converter(&bytes, &task_ctx, &codec, &codec)?;
+
+    // Verify adapter is restored
+    let has_adapter_after = verify_adapter_in_plan(&restored_plan, "restored");
+    println!("  Restored plan has adapter: {has_adapter_after}");
+
+    // Step 6: Execute and compare results
+    println!("\nStep 6: Executing plans and comparing results...");
+    let original_results =
+        datafusion::physical_plan::collect(Arc::clone(&original_plan), task_ctx.clone())
+            .await?;
+    let restored_results =
+        datafusion::physical_plan::collect(restored_plan, task_ctx).await?;
+
+    #[rustfmt::skip]
+    let expected = [
+        "+----+",
+        "| id |",
+        "+----+",
+        "| 6  |",
+        "| 7  |",
+        "| 8  |",
+        "| 9  |",
+        "| 10 |",
+        "+----+",
+    ];
+
+    println!("\n  Original plan results:");
+    arrow::util::pretty::print_batches(&original_results)?;
+    assert_batches_eq!(expected, &original_results);
+
+    println!("\n  Restored plan results:");
+    arrow::util::pretty::print_batches(&restored_results)?;
+    assert_batches_eq!(expected, &restored_results);
+
+    println!("\n=== Example Complete! ===");
+    println!("Key takeaways:");
+    println!(
+        "  1. PhysicalProtoConverterExtension provides execution_plan_to_proto/proto_to_execution_plan hooks"
+    );
+    println!("  2. Custom metadata can be wrapped as PhysicalExtensionNode");
+    println!("  3. Nested serialization (protobuf + JSON) works seamlessly");
+    println!(
+        "  4. Both plans produce identical results despite serialization round-trip"
+    );
+    println!("  5. Adapters are fully preserved through the serialization round-trip");
+
+    Ok(())
+}
+
+// ============================================================================
+// MetadataAdapter - A simple custom adapter with a tag
+// ============================================================================
+
+/// A custom PhysicalExprAdapter that wraps another adapter.
+/// The tag metadata is stored in the factory, not the adapter itself.
+#[derive(Debug)]
+struct MetadataAdapter {
+    inner: Arc<dyn PhysicalExprAdapter>,
+}
+
+impl PhysicalExprAdapter for MetadataAdapter {
+    fn rewrite(&self, expr: Arc<dyn PhysicalExpr>) -> Result<Arc<dyn PhysicalExpr>> {
+        // Simply delegate to inner adapter
+        self.inner.rewrite(expr)
+    }
+}
+
+// ============================================================================
+// MetadataAdapterFactory - Factory for creating MetadataAdapter instances
+// ============================================================================
+
+/// Factory for creating MetadataAdapter instances.
+/// The tag is stored in the factory and extracted via Debug formatting in `extract_adapter_tag`.
+#[derive(Debug)]
+struct MetadataAdapterFactory {
+    // Note: This field is read via Debug formatting in `extract_adapter_tag`.
+    // Rust's dead code analysis doesn't recognize Debug-based field access.
+    // In PR #19234, this field is used by `with_partition_values`, but that method
+    // doesn't exist in upstream DataFusion's PhysicalExprAdapter trait.
+    #[expect(dead_code)]
+    tag: String,
+}
+
+impl MetadataAdapterFactory {
+    fn new(tag: impl Into<String>) -> Self {
+        Self { tag: tag.into() }
+    }
+}
+
+impl PhysicalExprAdapterFactory for MetadataAdapterFactory {
+    fn create(
+        &self,
+        logical_file_schema: SchemaRef,
+        physical_file_schema: SchemaRef,
+    ) -> Result<Arc<dyn PhysicalExprAdapter>> {
+        let inner = DefaultPhysicalExprAdapterFactory
+            .create(logical_file_schema, physical_file_schema)?;
+        Ok(Arc::new(MetadataAdapter { inner }))
+    }
+}
+
+// ============================================================================
+// AdapterPreservingCodec - Custom codec that preserves adapters
+// ============================================================================
+
+/// Extension payload structure for serializing adapter info
+#[derive(Serialize, Deserialize)]
+struct ExtensionPayload {
+    /// Marker to identify this is our custom extension
+    marker: String,
+    /// JSON-serialized adapter metadata
+    adapter_metadata: AdapterMetadata,
+}
+
+/// Metadata about the adapter to recreate it during deserialization
+#[derive(Serialize, Deserialize)]
+struct AdapterMetadata {
+    /// The adapter tag (e.g., "v1")
+    tag: String,
+}
+
+const EXTENSION_MARKER: &str = "adapter_preserving_extension_v1";
+
+/// A codec that intercepts serialization to preserve adapter information.
+#[derive(Debug)]
+struct AdapterPreservingCodec;
+
+impl PhysicalExtensionCodec for AdapterPreservingCodec {
+    // Required method: decode custom extension nodes
+    fn try_decode(
+        &self,
+        buf: &[u8],
+        inputs: &[Arc<dyn ExecutionPlan>],
+        _ctx: &TaskContext,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        // Try to parse as our extension payload
+        if let Ok(payload) = serde_json::from_slice::<ExtensionPayload>(buf)
+            && payload.marker == EXTENSION_MARKER
+        {
+            if inputs.len() != 1 {
+                return Err(datafusion::error::DataFusionError::Plan(format!(
+                    "Extension node expected exactly 1 child, got {}",
+                    inputs.len()
+                )));
+            }
+            let inner_plan = inputs[0].clone();
+
+            // Recreate the adapter factory
+            let adapter_factory = create_adapter_factory(&payload.adapter_metadata.tag);
+
+            // Inject adapter into the plan
+            return inject_adapter_into_plan(inner_plan, adapter_factory);
+        }
+
+        not_impl_err!("Unknown extension type")
+    }
+
+    // Required method: encode custom execution plans
+    fn try_encode(
+        &self,
+        _node: Arc<dyn ExecutionPlan>,
+        _buf: &mut Vec<u8>,
+    ) -> Result<()> {
+        // We don't need this for the example - adapter wrapping happens in
+        // `execution_plan_to_proto` instead.
+        not_impl_err!(
+            "try_encode not used - adapter wrapping happens in execution_plan_to_proto"
+        )
+    }
+}
+
+impl PhysicalProtoConverterExtension for AdapterPreservingCodec {
+    fn execution_plan_to_proto(
+        &self,
+        plan: &Arc<dyn ExecutionPlan>,
+        extension_codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<PhysicalPlanNode> {
+        // Check if this is a DataSourceExec with adapter
+        if let Some(exec) = plan.downcast_ref::<DataSourceExec>()
+            && let Some(config) = exec.data_source().downcast_ref::<FileScanConfig>()
+            && let Some(adapter_factory) = &config.expr_adapter_factory
+            && let Some(tag) = extract_adapter_tag(adapter_factory.as_ref())
+        {
+            // Try to extract our MetadataAdapterFactory's tag
+            println!("  [Serialize] Found DataSourceExec with adapter tag: {tag}");
+
+            // 1. Create adapter metadata
+            let adapter_metadata = AdapterMetadata { tag };
+
+            // 2. Serialize the inner plan to protobuf
+            // Note that this will drop the custom adapter since the default serialization cannot handle it
+            let inner_proto = PhysicalPlanNode::try_from_physical_plan_with_converter(
+                Arc::clone(plan),
+                extension_codec,
+                self,
+            )?;
+
+            // 3. Create extension payload to wrap the plan
+            // so that the custom adapter gets re-attached during deserialization
+            // The choice of JSON is arbitrary; other formats could be used.
+            let payload = ExtensionPayload {
+                marker: EXTENSION_MARKER.to_string(),
+                adapter_metadata,
+            };
+            let payload_bytes = serde_json::to_vec(&payload).map_err(|e| {
+                datafusion::error::DataFusionError::Plan(format!(
+                    "Failed to serialize payload: {e}"
+                ))
+            })?;
+
+            // 4. Return as PhysicalExtensionNode with child plan in inputs
+            return Ok(PhysicalPlanNode {
+                physical_plan_type: Some(PhysicalPlanType::Extension(
+                    PhysicalExtensionNode {
+                        node: payload_bytes,
+                        inputs: vec![inner_proto],
+                    },
+                )),
+            });
+        }
+
+        // No adapter found, not a DataSourceExec, etc. - use default serialization
+        PhysicalPlanNode::try_from_physical_plan_with_converter(
+            Arc::clone(plan),
+            extension_codec,
+            self,
+        )
+    }
+
+    // Interception point: override deserialization to unwrap adapters
+    fn proto_to_execution_plan(
+        &self,
+        proto: &PhysicalPlanNode,
+        ctx: &PhysicalPlanDecodeContext<'_>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        // Check if this is our custom extension wrapper
+        if let Some(PhysicalPlanType::Extension(extension)) = &proto.physical_plan_type
+            && let Ok(payload) =
+                serde_json::from_slice::<ExtensionPayload>(&extension.node)
+            && payload.marker == EXTENSION_MARKER
+        {
+            println!(
+                "  [Deserialize] Found adapter extension with tag: {}",
+                payload.adapter_metadata.tag
+            );
+
+            // Get the inner plan proto from inputs field
+            if extension.inputs.is_empty() {
+                return Err(datafusion::error::DataFusionError::Plan(
+                    "Extension node missing child plan in inputs".to_string(),
+                ));
+            }
+            let inner_proto = &extension.inputs[0];
+
+            // Deserialize the inner plan
+            let inner_plan = self.default_proto_to_execution_plan(inner_proto, ctx)?;
+
+            // Recreate the adapter factory
+            let adapter_factory = create_adapter_factory(&payload.adapter_metadata.tag);
+
+            // Inject adapter into the plan
+            return inject_adapter_into_plan(inner_plan, adapter_factory);
+        }
+
+        // Not our extension - use default deserialization
+        self.default_proto_to_execution_plan(proto, ctx)
+    }
+
+    fn proto_to_physical_expr(
+        &self,
+        proto: &PhysicalExprNode,
+        input_schema: &Schema,
+        ctx: &PhysicalPlanDecodeContext<'_>,
+    ) -> Result<Arc<dyn PhysicalExpr>> {
+        parse_physical_expr_with_converter(proto, input_schema, ctx, self)
+    }
+
+    fn physical_expr_to_proto(
+        &self,
+        expr: &Arc<dyn PhysicalExpr>,
+        codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<PhysicalExprNode> {
+        serialize_physical_expr_with_converter(expr, codec, self)
+    }
+}
+
+// ============================================================================
+// Helper functions
+// ============================================================================
+
+/// Write a RecordBatch to Parquet in the object store
+async fn write_parquet(
+    store: &dyn ObjectStore,
+    path: &Path,
+    batch: &arrow::record_batch::RecordBatch,
+) -> Result<()> {
+    let mut buf = vec![];
+    let mut writer = ArrowWriter::try_new(&mut buf, batch.schema(), None)?;
+    writer.write(batch)?;
+    writer.close()?;
+
+    let payload = PutPayload::from_bytes(buf.into());
+    store.put(path, payload).await?;
+    Ok(())
+}
+
+/// Extract the tag from a MetadataAdapterFactory.
+///
+/// Note: Since `PhysicalExprAdapterFactory` doesn't provide `as_any()` for downcasting,
+/// we parse the Debug output. In a production system, you might add a dedicated trait
+/// method for metadata extraction.
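+///
+/// A minimal sketch of what such a dedicated trait could look like (the
+/// `TaggedAdapterFactory` name and method are hypothetical, not part of this
+/// PR or of DataFusion):
+///
+/// ```rust,ignore
+/// /// Factories that carry a serializable tag implement this alongside
+/// /// `PhysicalExprAdapterFactory`, so codecs can ask for the tag directly
+/// /// instead of parsing `Debug` output.
+/// trait TaggedAdapterFactory {
+///     fn tag(&self) -> Option<String>;
+/// }
+///
+/// impl TaggedAdapterFactory for MetadataAdapterFactory {
+///     fn tag(&self) -> Option<String> {
+///         Some(self.tag.clone())
+///     }
+/// }
+/// ```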
+fn extract_adapter_tag(factory: &dyn PhysicalExprAdapterFactory) -> Option<String> {
+    let debug_str = format!("{factory:?}");
+    if debug_str.contains("MetadataAdapterFactory") {
+        // Extract tag from debug output: MetadataAdapterFactory { tag: "v1" }
+        if let Some(start) = debug_str.find("tag: \"") {
+            let after_tag = &debug_str[start + 6..];
+            if let Some(end) = after_tag.find('"') {
+                return Some(after_tag[..end].to_string());
+            }
+        }
+    }
+    None
+}
+
+/// Create an adapter factory from a tag
+fn create_adapter_factory(tag: &str) -> Arc<dyn PhysicalExprAdapterFactory> {
+    Arc::new(MetadataAdapterFactory::new(tag))
+}
+
+/// Inject an adapter into a plan (assumes plan is a DataSourceExec with FileScanConfig)
+fn inject_adapter_into_plan(
+    plan: Arc<dyn ExecutionPlan>,
+    adapter_factory: Arc<dyn PhysicalExprAdapterFactory>,
+) -> Result<Arc<dyn ExecutionPlan>> {
+    if let Some(exec) = plan.downcast_ref::<DataSourceExec>()
+        && let Some(config) = exec.data_source().downcast_ref::<FileScanConfig>()
+    {
+        let new_config = FileScanConfigBuilder::from(config.clone())
+            .with_expr_adapter(Some(adapter_factory))
+            .build();
+        return Ok(DataSourceExec::from_data_source(new_config));
+    }
+    // If not a DataSourceExec with FileScanConfig, return as-is
+    Ok(plan)
+}
+
+/// Helper to verify if a plan has an adapter (for testing/validation)
+fn verify_adapter_in_plan(plan: &Arc<dyn ExecutionPlan>, label: &str) -> bool {
+    // Walk the plan tree to find DataSourceExec with adapter
+    fn check_plan(plan: &dyn ExecutionPlan) -> bool {
+        if let Some(exec) = plan.downcast_ref::<DataSourceExec>()
+            && let Some(config) = exec.data_source().downcast_ref::<FileScanConfig>()
+            && config.expr_adapter_factory.is_some()
+        {
+            return true;
+        }
+        // Check children
+        for child in plan.children() {
+            if check_plan(child.as_ref()) {
+                return true;
+            }
+        }
+        false
+    }
+
+    let has_adapter = check_plan(plan.as_ref());
+    println!("  [Verify] {label} plan adapter check: {has_adapter}");
+    has_adapter
+}
diff --git a/datafusion-examples/examples/csv_json_opener.rs b/datafusion-examples/examples/custom_data_source/csv_json_opener.rs
similarity index 60%
rename from datafusion-examples/examples/csv_json_opener.rs
rename to datafusion-examples/examples/custom_data_source/csv_json_opener.rs
index ef2a3eaca0c88..51c0e2167053e 100644
--- a/datafusion-examples/examples/csv_json_opener.rs
+++ b/datafusion-examples/examples/custom_data_source/csv_json_opener.rs
@@ -15,34 +15,36 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use std::sync::Arc;
 
 use arrow::datatypes::{DataType, Field, Schema};
+use datafusion::common::config::CsvOptions;
 use datafusion::{
     assert_batches_eq,
     datasource::{
         file_format::file_compression_type::FileCompressionType,
         listing::PartitionedFile,
         object_store::ObjectStoreUrl,
-        physical_plan::{CsvSource, FileSource, FileStream, JsonOpener, JsonSource},
+        physical_plan::{
+            CsvSource, FileSource, FileStreamBuilder, JsonOpener, JsonSource,
+        },
     },
     error::Result,
     physical_plan::metrics::ExecutionPlanMetricsSet,
-    test_util::aggr_test_schema,
 };
-use datafusion::datasource::{
-    physical_plan::FileScanConfigBuilder, table_schema::TableSchema,
-};
+use datafusion::datasource::physical_plan::FileScanConfigBuilder;
+use datafusion_examples::utils::datasets::ExampleDataset;
 use futures::StreamExt;
-use object_store::{local::LocalFileSystem, memory::InMemory, ObjectStore};
+use object_store::{ObjectStoreExt, local::LocalFileSystem, memory::InMemory};
 
 /// This example demonstrates using the low level [`FileStream`] / [`FileOpener`] APIs to directly
 /// read data from (CSV/JSON) into Arrow RecordBatches.
/// /// If you want to query data in CSV or JSON files, see the [`dataframe.rs`] and [`sql_query.rs`] examples -#[tokio::main] -async fn main() -> Result<()> { +pub async fn csv_json_opener() -> Result<()> { csv_opener().await?; json_opener().await?; Ok(()) @@ -50,48 +52,56 @@ async fn main() -> Result<()> { async fn csv_opener() -> Result<()> { let object_store = Arc::new(LocalFileSystem::new()); - let schema = aggr_test_schema(); - let testdata = datafusion::test_util::arrow_test_data(); - let path = format!("{testdata}/csv/aggregate_test_100.csv"); + let dataset = ExampleDataset::Cars; + let csv_path = dataset.path(); + let schema = dataset.schema(); - let path = std::path::Path::new(&path).canonicalize()?; + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; - let scan_config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - Arc::clone(&schema), - Arc::new(CsvSource::default()), - ) - .with_projection_indices(Some(vec![12, 0])) - .with_limit(Some(5)) - .with_file(PartitionedFile::new(path.display().to_string(), 10)) - .build(); - - let config = CsvSource::new(true, b',', b'"') + let source = CsvSource::new(Arc::clone(&schema)) + .with_csv_options(options) .with_comment(Some(b'#')) - .with_schema(TableSchema::from_file_schema(schema)) - .with_batch_size(8192) - .with_projection(&scan_config); + .with_batch_size(8192); + + let scan_config = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source) + .with_projection_indices(Some(vec![0, 1]))? + .with_limit(Some(5)) + .with_file(PartitionedFile::new(csv_path.display().to_string(), 10)) + .build(); - let opener = config.create_file_opener(object_store, &scan_config, 0); + let opener = + scan_config + .file_source() + .create_file_opener(object_store, &scan_config, 0)?; let mut result = vec![]; - let mut stream = - FileStream::new(&scan_config, 0, opener, &ExecutionPlanMetricsSet::new())?; + let metrics = ExecutionPlanMetricsSet::new(); + let mut stream = FileStreamBuilder::new(&scan_config) + .with_partition(0) + .with_file_opener(opener) + .with_metrics(&metrics) + .build()?; while let Some(batch) = stream.next().await.transpose()? { result.push(batch); } assert_batches_eq!( &[ - "+--------------------------------+----+", - "| c13 | c1 |", - "+--------------------------------+----+", - "| 6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW | c |", - "| C2GT5KVyOPZpgKVl110TyZO0NcJ434 | d |", - "| AyYVExXK6AR2qUTxNZ7qRHQOVGMLcz | b |", - "| 0keZ5G8BffGwgF2RwQD59TFzMStxCB | a |", - "| Ig1QcuKsjHXkproePdERo2w0mYzIqd | b |", - "+--------------------------------+----+", + "+-----+-------+", + "| car | speed |", + "+-----+-------+", + "| red | 20.0 |", + "| red | 20.3 |", + "| red | 21.4 |", + "| red | 21.5 |", + "| red | 19.0 |", + "+-----+-------+", ], &result ); @@ -121,24 +131,24 @@ async fn json_opener() -> Result<()> { projected, FileCompressionType::UNCOMPRESSED, Arc::new(object_store), + true, ); let scan_config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), - schema, - Arc::new(JsonSource::default()), + Arc::new(JsonSource::new(schema)), ) - .with_projection_indices(Some(vec![1, 0])) + .with_projection_indices(Some(vec![1, 0]))? 
.with_limit(Some(5)) .with_file(PartitionedFile::new(path.to_string(), 10)) .build(); - let mut stream = FileStream::new( - &scan_config, - 0, - Arc::new(opener), - &ExecutionPlanMetricsSet::new(), - )?; + let metrics = ExecutionPlanMetricsSet::new(); + let mut stream = FileStreamBuilder::new(&scan_config) + .with_partition(0) + .with_file_opener(Arc::new(opener)) + .with_metrics(&metrics) + .build()?; let mut result = vec![]; while let Some(batch) = stream.next().await.transpose()? { result.push(batch); diff --git a/datafusion-examples/examples/csv_sql_streaming.rs b/datafusion-examples/examples/custom_data_source/csv_sql_streaming.rs similarity index 82% rename from datafusion-examples/examples/csv_sql_streaming.rs rename to datafusion-examples/examples/custom_data_source/csv_sql_streaming.rs index 99264bbcb486d..4692086a10b26 100644 --- a/datafusion-examples/examples/csv_sql_streaming.rs +++ b/datafusion-examples/examples/custom_data_source/csv_sql_streaming.rs @@ -15,44 +15,46 @@ // specific language governing permissions and limitations // under the License. -use datafusion::common::test_util::datafusion_test_data; +//! See `main.rs` for how to run it. + use datafusion::error::Result; use datafusion::prelude::*; +use datafusion_examples::utils::datasets::ExampleDataset; /// This example demonstrates executing a simple query against an Arrow data source (CSV) and /// fetching results with streaming aggregation and streaming window -#[tokio::main] -async fn main() -> Result<()> { +pub async fn csv_sql_streaming() -> Result<()> { // create local execution context let ctx = SessionContext::new(); - let testdata = datafusion_test_data(); + let dataset = ExampleDataset::Cars; + let csv_path = dataset.path(); - // Register a table source and tell DataFusion the file is ordered by `ts ASC`. + // Register a table source and tell DataFusion the file is ordered by `car ASC`. // Note it is the responsibility of the user to make sure // that file indeed satisfies this condition or else incorrect answers may be produced. let asc = true; let nulls_first = true; - let sort_expr = vec![col("ts").sort(asc, nulls_first)]; + let sort_expr = vec![col("car").sort(asc, nulls_first)]; // register csv file with the execution context ctx.register_csv( "ordered_table", - &format!("{testdata}/window_1.csv"), + csv_path.to_str().unwrap(), CsvReadOptions::new().file_sort_order(vec![sort_expr]), ) .await?; // execute the query - // Following query can be executed with unbounded sources because group by expressions (e.g ts) is + // Following query can be executed with unbounded sources because group by expressions (e.g car) is // already ordered at the source. // // Unbounded sources means that if the input came from a "never ending" source (such as a FIFO // file on unix) the query could produce results incrementally as data was read. let df = ctx .sql( - "SELECT ts, MIN(inc_col), MAX(inc_col) \ + "SELECT car, MIN(speed), MAX(speed) \ FROM ordered_table \ - GROUP BY ts", + GROUP BY car", ) .await?; @@ -63,7 +65,7 @@ async fn main() -> Result<()> { // its result in streaming fashion, because its required ordering is already satisfied at the source. 
     let df = ctx
         .sql(
-            "SELECT ts, SUM(inc_col) OVER(ORDER BY ts ASC) \
+            "SELECT car, SUM(speed) OVER(ORDER BY car ASC) \
             FROM ordered_table",
         )
         .await?;
diff --git a/datafusion-examples/examples/custom_datasource.rs b/datafusion-examples/examples/custom_data_source/custom_datasource.rs
similarity index 87%
rename from datafusion-examples/examples/custom_datasource.rs
rename to datafusion-examples/examples/custom_data_source/custom_datasource.rs
index bc865fac5a338..701a886d2a140 100644
--- a/datafusion-examples/examples/custom_datasource.rs
+++ b/datafusion-examples/examples/custom_data_source/custom_datasource.rs
@@ -15,17 +15,19 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::any::Any;
+//! See `main.rs` for how to run it.
+
 use std::collections::{BTreeMap, HashMap};
 use std::fmt::{self, Debug, Formatter};
 use std::sync::{Arc, Mutex};
 use std::time::Duration;
 
 use async_trait::async_trait;
-use datafusion::arrow::array::{UInt64Builder, UInt8Builder};
+use datafusion::arrow::array::{UInt8Builder, UInt64Builder};
 use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use datafusion::arrow::record_batch::RecordBatch;
-use datafusion::datasource::{provider_as_source, TableProvider, TableType};
+use datafusion::common::tree_node::TreeNodeRecursion;
+use datafusion::datasource::{TableProvider, TableType, provider_as_source};
 use datafusion::error::Result;
 use datafusion::execution::context::TaskContext;
 use datafusion::logical_expr::LogicalPlanBuilder;
@@ -33,8 +35,8 @@ use datafusion::physical_expr::EquivalenceProperties;
 use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType};
 use datafusion::physical_plan::memory::MemoryStream;
 use datafusion::physical_plan::{
-    project_schema, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning,
-    PlanProperties, SendableRecordBatchStream,
+    DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties,
+    SendableRecordBatchStream, project_schema,
 };
 use datafusion::prelude::*;
 
@@ -42,8 +44,7 @@ use datafusion::catalog::Session;
 use tokio::time::timeout;
 
 /// This example demonstrates executing a simple query against a custom datasource
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn custom_datasource() -> Result<()> {
     // create our custom datasource and adding some users
     let db = CustomDataSource::default();
     db.populate_users();
@@ -160,10 +161,6 @@ impl Default for CustomDataSource {
 
 #[async_trait]
 impl TableProvider for CustomDataSource {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
     fn schema(&self) -> SchemaRef {
         SchemaRef::new(Schema::new(vec![
             Field::new("id", DataType::UInt8, false),
@@ -191,10 +188,11 @@ impl TableProvider for CustomDataSource {
 struct CustomExec {
     db: CustomDataSource,
     projected_schema: SchemaRef,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl CustomExec {
+    #[expect(clippy::needless_pass_by_value)]
     fn new(
         projections: Option<&Vec<usize>>,
         schema: SchemaRef,
@@ -205,7 +203,7 @@ impl CustomExec {
         Self {
             db,
             projected_schema,
-            cache,
+            cache: Arc::new(cache),
         }
     }
 
@@ -232,11 +230,7 @@ impl ExecutionPlan for CustomExec {
         "CustomExec"
     }
 
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -281,4 +275,20 @@ impl ExecutionPlan for CustomExec {
             None,
         )?))
     }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(
+            &dyn datafusion::physical_plan::PhysicalExpr,
+        ) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in the output ordering from equivalence properties
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = self.cache.output_ordering() {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
+    }
 }
diff --git a/datafusion-examples/examples/custom_file_casts.rs b/datafusion-examples/examples/custom_data_source/custom_file_casts.rs
similarity index 77%
rename from datafusion-examples/examples/custom_file_casts.rs
rename to datafusion-examples/examples/custom_data_source/custom_file_casts.rs
index 4d97ecd91dc64..71addc6d1bcb0 100644
--- a/datafusion-examples/examples/custom_file_casts.rs
+++ b/datafusion-examples/examples/custom_data_source/custom_file_casts.rs
@@ -15,43 +15,44 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use std::sync::Arc;
 
-use arrow::array::{record_batch, RecordBatch};
-use arrow::datatypes::{DataType, Field, FieldRef, Schema, SchemaRef};
+use arrow::array::{RecordBatch, record_batch};
+use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use datafusion::assert_batches_eq;
+use datafusion::common::Result;
 use datafusion::common::not_impl_err;
 use datafusion::common::tree_node::{Transformed, TransformedResult, TreeNode};
-use datafusion::common::{Result, ScalarValue};
 use datafusion::datasource::listing::{
     ListingTable, ListingTableConfig, ListingTableConfigExt, ListingTableUrl,
 };
 use datafusion::execution::context::SessionContext;
 use datafusion::execution::object_store::ObjectStoreUrl;
 use datafusion::parquet::arrow::ArrowWriter;
-use datafusion::physical_expr::expressions::CastExpr;
 use datafusion::physical_expr::PhysicalExpr;
+use datafusion::physical_expr::expressions::CastExpr;
 use datafusion::prelude::SessionConfig;
 use datafusion_physical_expr_adapter::{
     DefaultPhysicalExprAdapterFactory, PhysicalExprAdapter, PhysicalExprAdapterFactory,
 };
 use object_store::memory::InMemory;
 use object_store::path::Path;
-use object_store::{ObjectStore, PutPayload};
+use object_store::{ObjectStore, ObjectStoreExt, PutPayload};
 
 // Example showing how to implement custom casting rules to adapt file schemas.
-// This example enforces that casts must be strictly widening: if the file type is Int64 and the table type is Int32, it will error
-// before even reading the data.
-// Without this custom cast rule DataFusion would happily do the narrowing cast, potentially erroring only if it found a row with data it could not cast.
-
-#[tokio::main]
-async fn main() -> Result<()> {
+// This example enforces strictly widening casts: if the file type is Int64 and
+// the table type is Int32, it errors before reading the data. Without this
+// custom cast rule DataFusion would apply the narrowing cast and might only
+// error after reading a row that it could not cast.
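+//
+// As a concrete illustration of the rule (these pairs are illustrative, not an
+// exhaustive list):
+//
+//   Int16 -> Int32   widening, every value fits      => allowed
+//   Int32 -> Int64   widening                        => allowed
+//   Int64 -> Int32   narrowing, values may overflow  => rejected at plan time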
+pub async fn custom_file_casts() -> Result<()> {
     println!("=== Creating example data ===");
 
-    // Create a logical / table schema with an Int32 column
+    // Create a logical / table schema with an Int32 column (nullable)
     let logical_schema =
-        Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
+        Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, true)]));
 
     // Create some data that can be cast (Int16 -> Int32 is widening) and some that cannot (Int64 -> Int32 is narrowing)
     let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
@@ -139,7 +140,7 @@ async fn write_data(
     Ok(())
 }
 
-/// Factory for creating DefaultValuePhysicalExprAdapter instances
+/// Factory for creating custom cast physical expression adapters
 #[derive(Debug)]
 struct CustomCastPhysicalExprAdapterFactory {
     inner: Arc<dyn PhysicalExprAdapterFactory>,
@@ -156,19 +157,19 @@ impl PhysicalExprAdapterFactory for CustomCastPhysicalExprAdapterFactory {
         &self,
         logical_file_schema: SchemaRef,
         physical_file_schema: SchemaRef,
-    ) -> Arc<dyn PhysicalExprAdapter> {
+    ) -> Result<Arc<dyn PhysicalExprAdapter>> {
         let inner = self
             .inner
-            .create(logical_file_schema, Arc::clone(&physical_file_schema));
-        Arc::new(CustomCastsPhysicalExprAdapter {
+            .create(logical_file_schema, Arc::clone(&physical_file_schema))?;
+        Ok(Arc::new(CustomCastsPhysicalExprAdapter {
             physical_file_schema,
             inner,
-        })
+        }))
     }
 }
 
-/// Custom PhysicalExprAdapter that handles missing columns with default values from metadata
-/// and wraps DefaultPhysicalExprAdapter for standard schema adaptation
+/// Custom `PhysicalExprAdapter` that wraps the default adapter and rejects
+/// narrowing file-schema casts.
 #[derive(Debug, Clone)]
 struct CustomCastsPhysicalExprAdapter {
     physical_file_schema: SchemaRef,
@@ -177,15 +178,17 @@ struct CustomCastsPhysicalExprAdapter {
 
 impl PhysicalExprAdapter for CustomCastsPhysicalExprAdapter {
     fn rewrite(&self, mut expr: Arc<dyn PhysicalExpr>) -> Result<Arc<dyn PhysicalExpr>> {
-        // First delegate to the inner adapter to handle missing columns and discover any necessary casts
+        // First delegate to the inner adapter to handle standard schema adaptation
+        // and discover any necessary casts.
         expr = self.inner.rewrite(expr)?;
-        // Now we can apply custom casting rules or even swap out all CastExprs for a custom cast kernel / expression
-        // For example, [DataFusion Comet](https://github.com/apache/datafusion-comet) has a [custom cast kernel](https://github.com/apache/datafusion-comet/blob/b4ac876ab420ed403ac7fc8e1b29f42f1f442566/native/spark-expr/src/conversion_funcs/cast.rs#L133-L138).
+        // Now apply custom casting rules or swap CastExprs for a custom cast
+        // kernel / expression. For example, DataFusion Comet has a custom cast
+        // kernel in its native Spark expression implementation.
         expr.transform(|expr| {
-            if let Some(cast) = expr.as_any().downcast_ref::<CastExpr>() {
+            if let Some(cast) = expr.downcast_ref::<CastExpr>() {
                 let input_data_type = cast.expr().data_type(&self.physical_file_schema)?;
-                let output_data_type = cast.data_type(&self.physical_file_schema)?;
+                let output_data_type = cast.target_field().data_type();
                 if !cast.is_bigger_cast(&input_data_type) {
                     return not_impl_err!(
                         "Unsupported CAST from {input_data_type} to {output_data_type}"
@@ -196,14 +199,4 @@ impl PhysicalExprAdapter for CustomCastsPhysicalExprAdapter {
         })
         .data()
     }
-
-    fn with_partition_values(
-        &self,
-        partition_values: Vec<(FieldRef, ScalarValue)>,
-    ) -> Arc<dyn PhysicalExprAdapter> {
-        Arc::new(Self {
-            inner: self.inner.with_partition_values(partition_values),
-            ..self.clone()
-        })
-    }
 }
diff --git a/datafusion-examples/examples/custom_file_format.rs b/datafusion-examples/examples/custom_data_source/custom_file_format.rs
similarity index 92%
rename from datafusion-examples/examples/custom_file_format.rs
rename to datafusion-examples/examples/custom_data_source/custom_file_format.rs
index 67fe642fd46ee..0cfbe11877e4d 100644
--- a/datafusion-examples/examples/custom_file_format.rs
+++ b/datafusion-examples/examples/custom_data_source/custom_file_format.rs
@@ -15,7 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::{any::Any, sync::Arc};
+//! See `main.rs` for how to run it.
+
+use std::sync::Arc;
 
 use arrow::{
     array::{AsArray, RecordBatch, StringArray, UInt8Array},
@@ -25,12 +27,13 @@ use datafusion::{
     catalog::Session,
     common::{GetExt, Statistics},
     datasource::{
+        MemTable,
         file_format::{
-            csv::CsvFormatFactory, file_compression_type::FileCompressionType,
-            FileFormat, FileFormatFactory,
+            FileFormat, FileFormatFactory, csv::CsvFormatFactory,
+            file_compression_type::FileCompressionType,
         },
         physical_plan::{FileScanConfig, FileSinkConfig, FileSource},
-        MemTable,
+        table_schema::TableSchema,
     },
     error::Result,
     execution::session_state::SessionStateBuilder,
@@ -47,6 +50,42 @@
 /// TSVFileFormatFactory is responsible for creating instances of TSVFileFormat.
 /// The former, once registered with the SessionState, will then be used
 /// to facilitate SQL operations on TSV files, such as `COPY TO` shown here.
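+///
+/// Once the factory is registered, the custom format is driven entirely from
+/// SQL. A typical interaction (the target path is illustrative) looks like:
+///
+/// ```rust,ignore
+/// ctx.sql("COPY mem_table TO '/tmp/mem_table.tsv' STORED AS TSV;").await?;
+/// ```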
+pub async fn custom_file_format() -> Result<()> {
+    // Create a new context with the default configuration
+    let mut state = SessionStateBuilder::new().with_default_features().build();
+
+    // Register the custom file format
+    let file_format = Arc::new(TSVFileFactory::new());
+    state.register_file_format(file_format, true)?;
+
+    // Create a new context with the custom file format
+    let ctx = SessionContext::new_with_state(state);
+
+    let mem_table = create_mem_table();
+    ctx.register_table("mem_table", mem_table)?;
+
+    let temp_dir = tempdir().unwrap();
+    let table_save_path = temp_dir.path().join("mem_table.tsv");
+
+    let d = ctx
+        .sql(&format!(
+            "COPY mem_table TO '{}' STORED AS TSV;",
+            table_save_path.display(),
+        ))
+        .await?;
+
+    let results = d.collect().await?;
+    println!(
+        "Number of inserted rows: {:?}",
+        (results[0]
+            .column_by_name("count")
+            .unwrap()
+            .as_primitive::<UInt64Type>()
+            .value(0))
+    );
+
+    Ok(())
+}
 
 #[derive(Debug)]
 /// Custom file format that reads and writes TSV files
@@ -65,10 +104,6 @@ impl TSVFileFormat {
 
 #[async_trait::async_trait]
 impl FileFormat for TSVFileFormat {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
     fn get_ext(&self) -> String {
         "tsv".to_string()
     }
@@ -128,8 +163,8 @@ impl FileFormat for TSVFileFormat {
             .await
     }
 
-    fn file_source(&self) -> Arc<dyn FileSource> {
-        self.csv_file_format.file_source()
+    fn file_source(&self, table_schema: TableSchema) -> Arc<dyn FileSource> {
+        self.csv_file_format.file_source(table_schema)
     }
 }
 
@@ -168,10 +203,6 @@ impl FileFormatFactory for TSVFileFactory {
     fn default(&self) -> Arc<dyn FileFormat> {
         todo!()
     }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
 }
 
 impl GetExt for TSVFileFactory {
@@ -180,44 +211,6 @@ impl GetExt for TSVFileFactory {
     }
 }
 
-#[tokio::main]
-async fn main() -> Result<()> {
-    // Create a new context with the default configuration
-    let mut state = SessionStateBuilder::new().with_default_features().build();
-
-    // Register the custom file format
-    let file_format = Arc::new(TSVFileFactory::new());
-    state.register_file_format(file_format, true).unwrap();
-
-    // Create a new context with the custom file format
-    let ctx = SessionContext::new_with_state(state);
-
-    let mem_table = create_mem_table();
-    ctx.register_table("mem_table", mem_table).unwrap();
-
-    let temp_dir = tempdir().unwrap();
-    let table_save_path = temp_dir.path().join("mem_table.tsv");
-
-    let d = ctx
-        .sql(&format!(
-            "COPY mem_table TO '{}' STORED AS TSV;",
-            table_save_path.display(),
-        ))
-        .await?;
-
-    let results = d.collect().await?;
-    println!(
-        "Number of inserted rows: {:?}",
-        (results[0]
-            .column_by_name("count")
-            .unwrap()
-            .as_primitive::<UInt64Type>()
-            .value(0))
-    );
-
-    Ok(())
-}
-
 // create a simple mem table
 fn create_mem_table() -> Arc<MemTable> {
     let fields = vec![
diff --git a/datafusion-examples/examples/default_column_values.rs b/datafusion-examples/examples/custom_data_source/default_column_values.rs
similarity index 61%
rename from datafusion-examples/examples/default_column_values.rs
rename to datafusion-examples/examples/custom_data_source/default_column_values.rs
index d3a7d2ec67f3c..633b98244367e 100644
--- a/datafusion-examples/examples/default_column_values.rs
+++ b/datafusion-examples/examples/custom_data_source/default_column_values.rs
@@ -15,18 +15,18 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::any::Any;
+//! See `main.rs` for how to run it.
+ use std::collections::HashMap; use std::sync::Arc; use arrow::array::RecordBatch; -use arrow::datatypes::{DataType, Field, FieldRef, Schema, SchemaRef}; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use async_trait::async_trait; use datafusion::assert_batches_eq; use datafusion::catalog::memory::DataSourceExec; use datafusion::catalog::{Session, TableProvider}; -use datafusion::common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion::common::DFSchema; use datafusion::common::{Result, ScalarValue}; use datafusion::datasource::listing::PartitionedFile; @@ -37,40 +37,37 @@ use datafusion::logical_expr::utils::conjunction; use datafusion::logical_expr::{Expr, TableProviderFilterPushDown, TableType}; use datafusion::parquet::arrow::ArrowWriter; use datafusion::parquet::file::properties::WriterProperties; -use datafusion::physical_expr::expressions::{CastExpr, Column, Literal}; use datafusion::physical_expr::PhysicalExpr; use datafusion::physical_plan::ExecutionPlan; -use datafusion::prelude::{lit, SessionConfig}; +use datafusion::prelude::{SessionConfig, lit}; use datafusion_physical_expr_adapter::{ DefaultPhysicalExprAdapterFactory, PhysicalExprAdapter, PhysicalExprAdapterFactory, + replace_columns_with_literals, }; use futures::StreamExt; use object_store::memory::InMemory; use object_store::path::Path; -use object_store::{ObjectStore, PutPayload}; +use object_store::{ObjectStore, ObjectStoreExt, PutPayload}; // Metadata key for storing default values in field metadata const DEFAULT_VALUE_METADATA_KEY: &str = "example.default_value"; -// Example showing how to implement custom default value handling for missing columns -// using field metadata and PhysicalExprAdapter. -// -// This example demonstrates how to: -// 1. Store default values in field metadata using a constant key -// 2. Create a custom PhysicalExprAdapter that reads these defaults -// 3. Inject default values for missing columns in filter predicates -// 4. Use the DefaultPhysicalExprAdapter as a fallback for standard schema adaptation -// 5. Wrap string default values in cast expressions for proper type conversion -// -// Important: PhysicalExprAdapter is specifically designed for rewriting filter predicates -// that get pushed down to file scans. For handling missing columns in projections, -// other mechanisms in DataFusion are used (like SchemaAdapter). -// -// The metadata-based approach provides a flexible way to store default values as strings -// and cast them to the appropriate types at query time. - -#[tokio::main] -async fn main() -> Result<()> { +/// Example showing how to implement custom default value handling for missing columns +/// using field metadata and PhysicalExprAdapter. +/// +/// This example demonstrates how to: +/// 1. Store default values in field metadata using a constant key +/// 2. Create a custom PhysicalExprAdapter that reads these defaults +/// 3. Inject default values for missing columns in filter predicates using `replace_columns_with_literals` +/// 4. Use the DefaultPhysicalExprAdapter as a fallback for standard schema adaptation +/// 5. Convert string default values to proper types using `ScalarValue::cast_to()` at planning time +/// +/// Important: PhysicalExprAdapter handles rewriting both filter predicates and projection +/// expressions for file scans, including handling missing columns. 
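+///
+/// For instance, a default can be attached to a logical field like this (a
+/// sketch; the column name and value are illustrative):
+///
+/// ```rust,ignore
+/// use std::collections::HashMap;
+/// use arrow::datatypes::{DataType, Field};
+///
+/// let field = Field::new("status", DataType::Utf8, true).with_metadata(
+///     HashMap::from([(DEFAULT_VALUE_METADATA_KEY.to_string(), "active".to_string())]),
+/// );
+/// ```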
+///
+/// The metadata-based approach provides a flexible way to store default values as strings
+/// and cast them to the appropriate types at planning time, avoiding runtime overhead.
+pub async fn default_column_values() -> Result<()> {
     println!("=== Creating example data with missing columns and default values ===");
 
     // Create sample data where the logical schema has more columns than the physical schema
@@ -81,15 +78,14 @@ async fn main() -> Result<()> {
 
         let mut buf = vec![];
         let props = WriterProperties::builder()
-            .set_max_row_group_size(2)
+            .set_max_row_group_row_count(Some(2))
            .build();
 
         let mut writer =
-            ArrowWriter::try_new(&mut buf, physical_schema.clone(), Some(props))
-                .expect("creating writer");
+            ArrowWriter::try_new(&mut buf, physical_schema.clone(), Some(props))?;
 
-        writer.write(&batch).expect("Writing batch");
-        writer.close().unwrap();
+        writer.write(&batch)?;
+        writer.close()?;
         buf
     };
     let path = Path::from("example.parquet");
@@ -138,12 +134,14 @@
     println!("\n=== Key Insight ===");
     println!("This example demonstrates how PhysicalExprAdapter works:");
     println!("1. Physical schema only has 'id' and 'name' columns");
-    println!("2. Logical schema has 'id', 'name', 'status', and 'priority' columns with defaults");
-    println!("3. Our custom adapter intercepts filter expressions on missing columns");
-    println!("4. Default values from metadata are injected as cast expressions");
+    println!(
+        "2. Logical schema has 'id', 'name', 'status', and 'priority' columns with defaults"
+    );
+    println!(
+        "3. Our custom adapter uses replace_columns_with_literals to inject default values"
+    );
+    println!("4. Default values from metadata are cast to proper types at planning time");
     println!("5. The DefaultPhysicalExprAdapter handles other schema adaptations");
-    println!("\nNote: PhysicalExprAdapter is specifically for filter predicates.");
-    println!("For projection columns, different mechanisms handle missing columns.");
 
     Ok(())
 }
@@ -202,12 +200,8 @@ impl DefaultValueTableProvider {
 
 #[async_trait]
 impl TableProvider for DefaultValueTableProvider {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
     fn schema(&self) -> SchemaRef {
-        self.schema.clone()
+        Arc::clone(&self.schema)
     }
 
     fn table_type(&self) -> TableType {
@@ -228,14 +222,14 @@ impl TableProvider for DefaultValueTableProvider {
         filters: &[Expr],
         limit: Option<usize>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        let schema = self.schema.clone();
+        let schema = Arc::clone(&self.schema);
         let df_schema = DFSchema::try_from(schema.clone())?;
         let filter = state.create_physical_expr(
             conjunction(filters.iter().cloned()).unwrap_or_else(|| lit(true)),
             &df_schema,
         )?;
 
-        let parquet_source = ParquetSource::default()
+        let parquet_source = ParquetSource::new(schema.clone())
             .with_predicate(filter)
             .with_pushdown_filters(true);
 
@@ -257,10 +251,9 @@ impl TableProvider for DefaultValueTableProvider {
         let file_scan_config = FileScanConfigBuilder::new(
             ObjectStoreUrl::parse("memory://")?,
-            self.schema.clone(),
             Arc::new(parquet_source),
         )
-        .with_projection_indices(projection.cloned())
+        .with_projection_indices(projection.cloned())?
         .with_limit(limit)
         .with_file_group(file_group)
         .with_expr_adapter(Some(Arc::new(DefaultValuePhysicalExprAdapterFactory) as _));
@@ -280,17 +273,18 @@ impl PhysicalExprAdapterFactory for DefaultValuePhysicalExprAdapterFactory {
         &self,
         logical_file_schema: SchemaRef,
         physical_file_schema: SchemaRef,
-    ) -> Arc<dyn PhysicalExprAdapter> {
+    ) -> Result<Arc<dyn PhysicalExprAdapter>> {
         let default_factory = DefaultPhysicalExprAdapterFactory;
-        let default_adapter = default_factory
-            .create(logical_file_schema.clone(), physical_file_schema.clone());
+        let default_adapter = default_factory.create(
+            Arc::clone(&logical_file_schema),
+            Arc::clone(&physical_file_schema),
+        )?;
 
-        Arc::new(DefaultValuePhysicalExprAdapter {
+        Ok(Arc::new(DefaultValuePhysicalExprAdapter {
             logical_file_schema,
             physical_file_schema,
             default_adapter,
-            partition_values: Vec::new(),
-        })
+        }))
     }
 }
 
@@ -301,98 +295,36 @@ struct DefaultValuePhysicalExprAdapter {
     logical_file_schema: SchemaRef,
     physical_file_schema: SchemaRef,
     default_adapter: Arc<dyn PhysicalExprAdapter>,
-    partition_values: Vec<(FieldRef, ScalarValue)>,
 }
 
 impl PhysicalExprAdapter for DefaultValuePhysicalExprAdapter {
     fn rewrite(&self, expr: Arc<dyn PhysicalExpr>) -> Result<Arc<dyn PhysicalExpr>> {
-        // First try our custom default value injection for missing columns
-        let rewritten = expr
-            .transform(|expr| {
-                self.inject_default_values(
-                    expr,
-                    &self.logical_file_schema,
-                    &self.physical_file_schema,
-                )
-            })
-            .data()?;
-
-        // Then apply the default adapter as a fallback to handle standard schema differences
-        // like type casting, partition column handling, etc.
-        let default_adapter = if !self.partition_values.is_empty() {
-            self.default_adapter
-                .with_partition_values(self.partition_values.clone())
-        } else {
-            self.default_adapter.clone()
-        };
-
-        default_adapter.rewrite(rewritten)
-    }
-
-    fn with_partition_values(
-        &self,
-        partition_values: Vec<(FieldRef, ScalarValue)>,
-    ) -> Arc<dyn PhysicalExprAdapter> {
-        Arc::new(DefaultValuePhysicalExprAdapter {
-            logical_file_schema: self.logical_file_schema.clone(),
-            physical_file_schema: self.physical_file_schema.clone(),
-            default_adapter: self.default_adapter.clone(),
-            partition_values,
-        })
-    }
-}
-
-impl DefaultValuePhysicalExprAdapter {
-    fn inject_default_values(
-        &self,
-        expr: Arc<dyn PhysicalExpr>,
-        logical_file_schema: &Schema,
-        physical_file_schema: &Schema,
-    ) -> Result<Transformed<Arc<dyn PhysicalExpr>>> {
-        if let Some(column) = expr.as_any().downcast_ref::<Column>() {
-            let column_name = column.name();
-
-            // Check if this column exists in the physical schema
-            if physical_file_schema.index_of(column_name).is_err() {
-                // Column is missing from physical schema, check if logical schema has a default
-                if let Ok(logical_field) =
-                    logical_file_schema.field_with_name(column_name)
-                {
-                    if let Some(default_value_str) =
-                        logical_field.metadata().get(DEFAULT_VALUE_METADATA_KEY)
-                    {
-                        // Create a string literal and wrap it in a cast expression
-                        let default_literal = self.create_default_value_expr(
-                            default_value_str,
-                            logical_field.data_type(),
-                        )?;
-                        return Ok(Transformed::yes(default_literal));
-                    }
-                }
+        // Pre-compute replacements for missing columns with default values
+        let mut replacements = HashMap::new();
+        for field in self.logical_file_schema.fields() {
+            // Skip columns that exist in physical schema
+            if self.physical_file_schema.index_of(field.name()).is_ok() {
+                continue;
             }
-        }
-
-        // No transformation needed
-        Ok(Transformed::no(expr))
-    }
-
-    fn create_default_value_expr(
-        &self,
-        value_str: &str,
-        data_type: &DataType,
-    ) -> Result<Arc<dyn PhysicalExpr>> {
-        // Create a string literal with the default value
-        let string_literal =
-            Arc::new(Literal::new(ScalarValue::Utf8(Some(value_str.to_string()))));
-
-        // If the target type is already Utf8, return the string literal directly
-        if matches!(data_type, DataType::Utf8) {
-            return Ok(string_literal);
+            // Check if this missing column has a default value in metadata
+            if let Some(default_str) = field.metadata().get(DEFAULT_VALUE_METADATA_KEY) {
+                // Create a Utf8 ScalarValue from the string and cast it to the target type
+                let string_value = ScalarValue::Utf8(Some(default_str.to_string()));
+                let typed_value = string_value.cast_to(field.data_type())?;
+                replacements.insert(field.name().as_str(), typed_value);
+            }
         }
 
-        // Otherwise, wrap the string literal in a cast expression
-        let cast_expr = Arc::new(CastExpr::new(string_literal, data_type.clone(), None));
+        // Replace columns with their default literals if any
+        let rewritten = if !replacements.is_empty() {
+            let refs: HashMap<_, _> = replacements.iter().map(|(k, v)| (*k, v)).collect();
+            replace_columns_with_literals(expr, &refs)?
+        } else {
+            expr
+        };
 
-        Ok(cast_expr)
+        // Apply the default adapter as a fallback for other schema adaptations
+        self.default_adapter.rewrite(rewritten)
     }
 }
diff --git a/datafusion-examples/examples/file_stream_provider.rs b/datafusion-examples/examples/custom_data_source/file_stream_provider.rs
similarity index 90%
rename from datafusion-examples/examples/file_stream_provider.rs
rename to datafusion-examples/examples/custom_data_source/file_stream_provider.rs
index e6c59d57e98de..5b43072d43f80 100644
--- a/datafusion-examples/examples/file_stream_provider.rs
+++ b/datafusion-examples/examples/custom_data_source/file_stream_provider.rs
@@ -15,6 +15,31 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
+/// Demonstrates how to use [`FileStreamProvider`] and [`StreamTable`] to stream data
+/// from a file-like source (FIFO) into DataFusion for continuous querying.
+///
+/// On non-Windows systems, this example creates a named pipe (FIFO) and
+/// writes rows into it asynchronously while DataFusion reads the data
+/// through a `FileStreamProvider`.
+///
+/// This illustrates how to integrate dynamically updated data sources
+/// with DataFusion without needing to reload the entire dataset each time.
+///
+/// This example does not work on Windows.
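+///
+/// The writer side boils down to appending lines to the FIFO path while a
+/// query is running (a simplified sketch of what the module below does;
+/// `fifo_path` and the row values are illustrative, error handling omitted):
+///
+/// ```rust,ignore
+/// use std::io::Write;
+///
+/// let mut fifo = std::fs::OpenOptions::new().write(true).open(&fifo_path)?;
+/// // Each line becomes a CSV row that the running query can pick up.
+/// writeln!(fifo, "{},{}", a1, a2)?;
+/// ```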
+pub async fn file_stream_provider() -> datafusion::error::Result<()> { + #[cfg(target_os = "windows")] + { + println!("file_stream_provider example does not work on windows"); + Ok(()) + } + #[cfg(not(target_os = "windows"))] + { + non_windows::main().await + } +} + #[cfg(not(target_os = "windows"))] mod non_windows { use datafusion::assert_batches_eq; @@ -22,8 +47,8 @@ mod non_windows { use std::fs::{File, OpenOptions}; use std::io::Write; use std::path::PathBuf; - use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; + use std::sync::atomic::{AtomicBool, Ordering}; use std::thread; use std::time::Duration; @@ -34,9 +59,9 @@ mod non_windows { use tempfile::TempDir; use tokio::task::JoinSet; - use datafusion::common::{exec_err, Result}; - use datafusion::datasource::stream::{FileStreamProvider, StreamConfig, StreamTable}; + use datafusion::common::{Result, exec_err}; use datafusion::datasource::TableProvider; + use datafusion::datasource::stream::{FileStreamProvider, StreamConfig, StreamTable}; use datafusion::logical_expr::SortExpr; use datafusion::prelude::{SessionConfig, SessionContext}; @@ -101,7 +126,6 @@ mod non_windows { let broken_pipe_timeout = Duration::from_secs(10); let sa = file_path; // Spawn a new thread to write to the FIFO file - #[allow(clippy::disallowed_methods)] // spawn allowed only in tests tasks.spawn_blocking(move || { let file = OpenOptions::new().write(true).open(sa).unwrap(); // Reference time to use when deciding to fail the test @@ -186,16 +210,3 @@ mod non_windows { Ok(()) } } - -#[tokio::main] -async fn main() -> datafusion::error::Result<()> { - #[cfg(target_os = "windows")] - { - println!("file_stream_provider example does not work on windows"); - Ok(()) - } - #[cfg(not(target_os = "windows"))] - { - non_windows::main().await - } -} diff --git a/datafusion-examples/examples/custom_data_source/main.rs b/datafusion-examples/examples/custom_data_source/main.rs new file mode 100644 index 0000000000000..40409d3690d3a --- /dev/null +++ b/datafusion-examples/examples/custom_data_source/main.rs @@ -0,0 +1,138 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! # These examples are all related to extending or defining how DataFusion reads data +//! +//! These examples demonstrate how DataFusion reads data. +//! +//! ## Usage +//! ```bash +//! cargo run --example custom_data_source -- [all|adapter_serialization|csv_json_opener|csv_sql_streaming|custom_datasource|custom_file_casts|custom_file_format|default_column_values|file_stream_provider] +//! ``` +//! +//! Each subcommand runs a corresponding example: +//! - `all` — run all examples included in this module +//! +//! - `adapter_serialization` +//! 
(file: adapter_serialization.rs, desc: Preserve custom PhysicalExprAdapter information during plan serialization using PhysicalExtensionCodec interception)
+//!
+//! - `csv_json_opener`
+//!   (file: csv_json_opener.rs, desc: Use low-level FileOpener APIs for CSV/JSON)
+//!
+//! - `csv_sql_streaming`
+//!   (file: csv_sql_streaming.rs, desc: Run a streaming SQL query against CSV data)
+//!
+//! - `custom_datasource`
+//!   (file: custom_datasource.rs, desc: Query a custom TableProvider)
+//!
+//! - `custom_file_casts`
+//!   (file: custom_file_casts.rs, desc: Implement custom casting rules)
+//!
+//! - `custom_file_format`
+//!   (file: custom_file_format.rs, desc: Write to a custom file format)
+//!
+//! - `default_column_values`
+//!   (file: default_column_values.rs, desc: Custom default values using metadata)
+//!
+//! - `file_stream_provider`
+//!   (file: file_stream_provider.rs, desc: Read/write via FileStreamProvider for streams)
+
+mod adapter_serialization;
+mod csv_json_opener;
+mod csv_sql_streaming;
+mod custom_datasource;
+mod custom_file_casts;
+mod custom_file_format;
+mod default_column_values;
+mod file_stream_provider;
+
+use datafusion::error::{DataFusionError, Result};
+use strum::{IntoEnumIterator, VariantNames};
+use strum_macros::{Display, EnumIter, EnumString, VariantNames};
+
+#[derive(EnumIter, EnumString, Display, VariantNames)]
+#[strum(serialize_all = "snake_case")]
+enum ExampleKind {
+    All,
+    AdapterSerialization,
+    CsvJsonOpener,
+    CsvSqlStreaming,
+    CustomDatasource,
+    CustomFileCasts,
+    CustomFileFormat,
+    DefaultColumnValues,
+    FileStreamProvider,
+}
+
+impl ExampleKind {
+    const EXAMPLE_NAME: &str = "custom_data_source";
+
+    fn runnable() -> impl Iterator<Item = ExampleKind> {
+        ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All))
+    }
+
+    async fn run(&self) -> Result<()> {
+        match self {
+            ExampleKind::All => {
+                for example in ExampleKind::runnable() {
+                    println!("Running example: {example}");
+                    Box::pin(example.run()).await?;
+                }
+            }
+            ExampleKind::AdapterSerialization => {
+                adapter_serialization::adapter_serialization().await?
+            }
+            ExampleKind::CsvJsonOpener => csv_json_opener::csv_json_opener().await?,
+            ExampleKind::CsvSqlStreaming => {
+                csv_sql_streaming::csv_sql_streaming().await?
+            }
+            ExampleKind::CustomDatasource => {
+                custom_datasource::custom_datasource().await?
+            }
+            ExampleKind::CustomFileCasts => {
+                custom_file_casts::custom_file_casts().await?
+            }
+            ExampleKind::CustomFileFormat => {
+                custom_file_format::custom_file_format().await?
+            }
+            ExampleKind::DefaultColumnValues => {
+                default_column_values::default_column_values().await?
+            }
+            ExampleKind::FileStreamProvider => {
+                file_stream_provider::file_stream_provider().await?
+            }
+        }
+        Ok(())
+    }
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let usage = format!(
+        "Usage: cargo run --example {} -- [{}]",
+        ExampleKind::EXAMPLE_NAME,
+        ExampleKind::VARIANTS.join("|")
+    );
+
+    let example: ExampleKind = std::env::args()
+        .nth(1)
+        .unwrap_or_else(|| ExampleKind::All.to_string())
+        .parse()
+        .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?;
+
+    example.run().await
+}
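Both grouped-example binaries in this diff (`custom_data_source` and `data_io` below) share the same strum-based dispatcher. For readers unfamiliar with those derives, here is a minimal, self-contained sketch of the pattern; the two-variant `Demo` enum is hypothetical, and the only assumed dependencies are `strum` and `strum_macros`:

```rust
use strum::{IntoEnumIterator, VariantNames};
use strum_macros::{Display, EnumIter, EnumString, VariantNames};

#[derive(Display, EnumIter, EnumString, VariantNames)]
#[strum(serialize_all = "snake_case")]
enum Demo {
    All,
    CsvJsonOpener,
}

fn main() {
    // EnumString + serialize_all lets the CLI argument parse from snake_case
    let parsed: Demo = "csv_json_opener".parse().unwrap();
    println!("parsed: {parsed}");
    // VARIANTS supplies the snake_case names for the usage string
    println!("usage: [{}]", Demo::VARIANTS.join("|"));
    // EnumIter drives the `all` subcommand's loop over every variant
    for variant in Demo::iter() {
        println!("would run: {variant}");
    }
}
```

The derive macro and the trait can share the name `VariantNames` in one scope because macros and traits live in separate namespaces, which is why the example files import both.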
diff --git a/datafusion-examples/examples/catalog.rs b/datafusion-examples/examples/data_io/catalog.rs
similarity index 95%
rename from datafusion-examples/examples/catalog.rs
rename to datafusion-examples/examples/data_io/catalog.rs
index 229867cdfc5bb..7e5cc5a4cfc05 100644
--- a/datafusion-examples/examples/catalog.rs
+++ b/datafusion-examples/examples/data_io/catalog.rs
@@ -15,27 +15,29 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+//!
 //! Simple example of a catalog/schema implementation.
 use async_trait::async_trait;
 use datafusion::{
     arrow::util::pretty,
     catalog::{CatalogProvider, CatalogProviderList, SchemaProvider},
     datasource::{
-        file_format::{csv::CsvFormat, FileFormat},
-        listing::{ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl},
         TableProvider,
+        file_format::{FileFormat, csv::CsvFormat},
+        listing::{ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl},
     },
     error::Result,
     execution::context::SessionState,
     prelude::SessionContext,
 };
 use std::sync::RwLock;
-use std::{any::Any, collections::HashMap, path::Path, sync::Arc};
+use std::{collections::HashMap, path::Path, sync::Arc};
 use std::{fs::File, io::Write};
 use tempfile::TempDir;
 
-#[tokio::main]
-async fn main() -> Result<()> {
+/// Register the table into a custom catalog
+pub async fn catalog() -> Result<()> {
     env_logger::builder()
         .filter_level(log::LevelFilter::Info)
         .init();
@@ -134,12 +136,13 @@ struct DirSchemaOpts<'a> {
     dir: &'a Path,
     format: Arc<dyn FileFormat>,
 }
+
 /// Schema where every file with extension `ext` in a given `dir` is a table.
 #[derive(Debug)]
 struct DirSchema {
-    ext: String,
     tables: RwLock<HashMap<String, Arc<dyn TableProvider>>>,
 }
+
 impl DirSchema {
     async fn create(state: &SessionState, opts: DirSchemaOpts<'_>) -> Result<Arc<Self>> {
         let DirSchemaOpts { ext, dir, format } = opts;
@@ -169,21 +172,12 @@ impl DirSchema {
         }
         Ok(Arc::new(Self {
             tables: RwLock::new(tables),
-            ext: ext.to_string(),
         }))
     }
-
-    #[allow(unused)]
-    fn name(&self) -> &str {
-        &self.ext
-    }
 }
 
 #[async_trait]
 impl SchemaProvider for DirSchema {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
     fn table_names(&self) -> Vec<String> {
         let tables = self.tables.read().unwrap();
         tables.keys().cloned().collect::<Vec<_>>()
@@ -198,6 +192,7 @@ impl SchemaProvider for DirSchema {
         let tables = self.tables.read().unwrap();
         tables.contains_key(name)
     }
+
     fn register_table(
         &self,
         name: String,
@@ -211,7 +206,6 @@ impl SchemaProvider for DirSchema {
 
     /// If supported by the implementation, removes an existing table from this schema and returns it.
     /// If no table of that name exists, returns Ok(None).
-    #[allow(unused_variables)]
     fn deregister_table(&self, name: &str) -> Result<Option<Arc<dyn TableProvider>>> {
         let mut tables = self.tables.write().unwrap();
         log::info!("dropping table {name}");
@@ -223,6 +217,7 @@ struct DirCatalog {
     schemas: RwLock<HashMap<String, Arc<dyn SchemaProvider>>>,
 }
+
 impl DirCatalog {
     fn new() -> Self {
         Self {
@@ -230,10 +225,8 @@
         }
     }
 }
+
 impl CatalogProvider for DirCatalog {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
     fn register_schema(
         &self,
         name: &str,
@@ -260,11 +253,13 @@
             }
         }
     }
 }
+
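As a complement to the custom `DirCatalog` in this file, here is a hedged sketch of the registration flow using DataFusion's built-in in-memory providers. The `datafusion::catalog::memory` paths match the module used elsewhere in this diff, but treat the exact types as an assumption worth verifying against your DataFusion version:

```rust
use std::sync::Arc;

use datafusion::catalog::memory::{MemoryCatalogProvider, MemorySchemaProvider};
use datafusion::error::Result;
use datafusion::prelude::SessionContext;

fn register_catalog_sketch() -> Result<()> {
    let ctx = SessionContext::new();
    let catalog = Arc::new(MemoryCatalogProvider::new());
    // A schema groups tables; a catalog groups schemas.
    catalog.register_schema("my_schema", Arc::new(MemorySchemaProvider::new()))?;
    // Tables registered under this catalog resolve as my_catalog.my_schema.<table>
    ctx.register_catalog("my_catalog", catalog);
    Ok(())
}
```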
 /// A catalog list holds multiple catalog providers. Each context has a single catalog list.
 #[derive(Debug)]
 struct CustomCatalogProviderList {
     catalogs: RwLock<HashMap<String, Arc<dyn CatalogProvider>>>,
 }
+
 impl CustomCatalogProviderList {
     fn new() -> Self {
         Self {
@@ -272,10 +267,8 @@
         }
     }
 }
+
 impl CatalogProviderList for CustomCatalogProviderList {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
     fn register_catalog(
         &self,
         name: String,
diff --git a/datafusion-examples/examples/data_io/in_memory_object_store.rs b/datafusion-examples/examples/data_io/in_memory_object_store.rs
new file mode 100644
index 0000000000000..9a308f06c5abd
--- /dev/null
+++ b/datafusion-examples/examples/data_io/in_memory_object_store.rs
@@ -0,0 +1,81 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! See `main.rs` for how to run it.
+//!
+//! This follows the recommended approach: implement the `ObjectStore` trait
+//! (or use an existing implementation), register it with DataFusion, and then
+//! read a URL "path" from that store.
+//! See the in-memory reference implementation:
+//! https://docs.rs/object_store/latest/object_store/memory/struct.InMemory.html
+
+use std::sync::Arc;
+
+use arrow::datatypes::{DataType, Field, Schema};
+use datafusion::assert_batches_eq;
+use datafusion::common::Result;
+use datafusion::execution::object_store::ObjectStoreUrl;
+use datafusion::prelude::{CsvReadOptions, SessionContext};
+use object_store::memory::InMemory;
+use object_store::path::Path;
+use object_store::{ObjectStore, ObjectStoreExt, PutPayload};
+
+/// Demonstrates reading CSV data from an in-memory object store.
+///
+/// The same pattern applies to JSON/Parquet: register a store for a URL
+/// prefix, write bytes into the store, then read via that URL.
+pub async fn in_memory_object_store() -> Result<()> {
+    let store: Arc<dyn ObjectStore> = Arc::new(InMemory::new());
+    let ctx = SessionContext::new();
+    let object_store_url = ObjectStoreUrl::parse("memory://")?;
+    // Register a URL prefix to route reads through this object store.
+    ctx.register_object_store(object_store_url.as_ref(), Arc::clone(&store));
+
+    let schema = Schema::new(vec![
+        Field::new("id", DataType::Int64, false),
+        Field::new("name", DataType::Utf8, false),
+    ]);
+
+    println!("=== CSV from memory ===");
+    let csv_path = Path::from("/people.csv");
+    let csv_data = b"id,name\n1,Alice\n2,Bob\n";
+    // Write bytes into the in-memory object store.
+    store
+        .put(&csv_path, PutPayload::from_static(csv_data))
+        .await?;
+    // Read using the URL that matches the registered prefix.
+    let csv = ctx
+        .read_csv(
+            "memory:///people.csv",
+            CsvReadOptions::new().schema(&schema),
+        )
+        .await?
+ .collect() + .await?; + #[rustfmt::skip] + let expected = [ + "+----+-------+", + "| id | name |", + "+----+-------+", + "| 1 | Alice |", + "| 2 | Bob |", + "+----+-------+", + ]; + assert_batches_eq!(expected, &csv); + + Ok(()) +} diff --git a/datafusion-examples/examples/json_shredding.rs b/datafusion-examples/examples/data_io/json_shredding.rs similarity index 74% rename from datafusion-examples/examples/json_shredding.rs rename to datafusion-examples/examples/data_io/json_shredding.rs index 5ef8b59b64200..72fbb56773123 100644 --- a/datafusion-examples/examples/json_shredding.rs +++ b/datafusion-examples/examples/data_io/json_shredding.rs @@ -15,17 +15,18 @@ // specific language governing permissions and limitations // under the License. -use std::any::Any; +//! See `main.rs` for how to run it. + use std::sync::Arc; use arrow::array::{RecordBatch, StringArray}; -use arrow::datatypes::{DataType, Field, FieldRef, Schema, SchemaRef}; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion::assert_batches_eq; use datafusion::common::tree_node::{ Transformed, TransformedResult, TreeNode, TreeNodeRecursion, }; -use datafusion::common::{assert_contains, exec_datafusion_err, Result}; +use datafusion::common::{Result, assert_contains, exec_datafusion_err}; use datafusion::datasource::listing::{ ListingTable, ListingTableConfig, ListingTableConfigExt, ListingTableUrl, }; @@ -37,7 +38,7 @@ use datafusion::logical_expr::{ use datafusion::parquet::arrow::ArrowWriter; use datafusion::parquet::file::properties::WriterProperties; use datafusion::physical_expr::PhysicalExpr; -use datafusion::physical_expr::{expressions, ScalarFunctionExpr}; +use datafusion::physical_expr::{ScalarFunctionExpr, expressions}; use datafusion::prelude::SessionConfig; use datafusion::scalar::ScalarValue; use datafusion_physical_expr_adapter::{ @@ -45,7 +46,7 @@ use datafusion_physical_expr_adapter::{ }; use object_store::memory::InMemory; use object_store::path::Path; -use object_store::{ObjectStore, PutPayload}; +use object_store::{ObjectStoreExt, PutPayload}; // Example showing how to implement custom filter rewriting for JSON shredding. // @@ -63,8 +64,7 @@ use object_store::{ObjectStore, PutPayload}; // 1. Push down predicates for better filtering // 2. Avoid expensive JSON parsing at query time // 3. 
Leverage columnar storage benefits for the materialized fields
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn json_shredding() -> Result<()> {
     println!("=== Creating example data with flat columns and underscore prefixes ===");
 
     // Create sample data with flat columns using underscore prefixes
@@ -75,7 +75,7 @@
     let mut buf = vec![];
 
     let props = WriterProperties::builder()
-        .set_max_row_group_size(2)
+        .set_max_row_group_row_count(Some(2))
         .build();
 
     let mut writer = ArrowWriter::try_new(&mut buf, batch.schema(), Some(props))
@@ -206,10 +206,6 @@ impl Default for JsonGetStr {
 }
 
 impl ScalarUDFImpl for JsonGetStr {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
     fn name(&self) -> &str {
         "json_get_str"
     }
@@ -232,7 +228,7 @@ impl ScalarUDFImpl for JsonGetStr {
             _ => {
                 return Err(exec_datafusion_err!(
                     "json_get_str first argument must be a string"
-                ))
+                ));
             }
         };
         // We expect a string array that contains JSON strings
@@ -248,7 +244,7 @@ impl ScalarUDFImpl for JsonGetStr {
             _ => {
                 return Err(exec_datafusion_err!(
                     "json_get_str second argument must be a string array"
-                ))
+                ));
             }
         };
         let values = json_array
@@ -274,17 +270,17 @@ impl PhysicalExprAdapterFactory for ShreddedJsonRewriterFactory {
         &self,
         logical_file_schema: SchemaRef,
         physical_file_schema: SchemaRef,
-    ) -> Arc<dyn PhysicalExprAdapter> {
+    ) -> Result<Arc<dyn PhysicalExprAdapter>> {
         let default_factory = DefaultPhysicalExprAdapterFactory;
-        let default_adapter = default_factory
-            .create(logical_file_schema.clone(), physical_file_schema.clone());
+        let default_adapter = default_factory.create(
+            Arc::clone(&logical_file_schema),
+            Arc::clone(&physical_file_schema),
+        )?;
 
-        Arc::new(ShreddedJsonRewriter {
-            logical_file_schema,
+        Ok(Arc::new(ShreddedJsonRewriter {
             physical_file_schema,
             default_adapter,
-            partition_values: Vec::new(),
-        })
+        }))
     }
 }
 
@@ -292,10 +288,8 @@
 /// and wraps DefaultPhysicalExprAdapter for standard schema adaptation
 #[derive(Debug)]
 struct ShreddedJsonRewriter {
-    logical_file_schema: SchemaRef,
     physical_file_schema: SchemaRef,
     default_adapter: Arc<dyn PhysicalExprAdapter>,
-    partition_values: Vec<(FieldRef, ScalarValue)>,
 }
 
 impl PhysicalExprAdapter for ShreddedJsonRewriter {
@@ -306,27 +300,8 @@
             .data()?;
 
         // Then apply the default adapter as a fallback to handle standard schema differences
-        // like type casting, missing columns, and partition column handling
-        let default_adapter = if !self.partition_values.is_empty() {
-            self.default_adapter
-                .with_partition_values(self.partition_values.clone())
-        } else {
-            self.default_adapter.clone()
-        };
-
-        default_adapter.rewrite(rewritten)
-    }
-
-    fn with_partition_values(
-        &self,
-        partition_values: Vec<(FieldRef, ScalarValue)>,
-    ) -> Arc<dyn PhysicalExprAdapter> {
-        Arc::new(ShreddedJsonRewriter {
-            logical_file_schema: self.logical_file_schema.clone(),
-            physical_file_schema: self.physical_file_schema.clone(),
-            default_adapter: self.default_adapter.clone(),
-            partition_values,
-        })
+        // like type casting and missing columns
+        self.default_adapter.rewrite(rewritten)
    }
 }
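The next hunk is mostly a mechanical rewrite of nested `if let` blocks into let-chains. A tiny standalone sketch of the construct (stable as of the Rust 2024 edition), for readers who have not met it yet:

```rust
// Several patterns and boolean guards combined in one `if`,
// avoiding the pyramid of nested blocks that the hunk below removes.
fn first_even_square(values: &[Option<i32>]) -> Option<i32> {
    for value in values {
        if let Some(x) = value
            && x % 2 == 0
        {
            return Some(x * x);
        }
    }
    None
}
```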
@@ -336,44 +311,39 @@ impl ShreddedJsonRewriter {
         expr: Arc<dyn PhysicalExpr>,
         physical_file_schema: &Schema,
     ) -> Result<Transformed<Arc<dyn PhysicalExpr>>> {
-        if let Some(func) = expr.as_any().downcast_ref::<ScalarFunctionExpr>() {
-            if func.name() == "json_get_str" && func.args().len() == 2 {
-                // Get the key from the first argument
-                if let Some(literal) = func.args()[0]
-                    .as_any()
-                    .downcast_ref::<expressions::Literal>()
+        if let Some(func) = expr.downcast_ref::<ScalarFunctionExpr>()
+            && func.name() == "json_get_str"
+            && func.args().len() == 2
+        {
+            // Get the key from the first argument
+            if let Some(literal) = func.args()[0].downcast_ref::<expressions::Literal>()
+                && let ScalarValue::Utf8(Some(field_name)) = literal.value()
+            {
+                // Get the column from the second argument
+                if let Some(column) = func.args()[1].downcast_ref::<expressions::Column>()
                 {
-                    if let ScalarValue::Utf8(Some(field_name)) = literal.value() {
-                        // Get the column from the second argument
-                        if let Some(column) = func.args()[1]
-                            .as_any()
-                            .downcast_ref::<expressions::Column>()
-                        {
-                            let column_name = column.name();
-                            // Check if there's a flat column with underscore prefix
-                            let flat_column_name = format!("_{column_name}.{field_name}");
-
-                            if let Ok(flat_field_index) =
-                                physical_file_schema.index_of(&flat_column_name)
-                            {
-                                let flat_field =
-                                    physical_file_schema.field(flat_field_index);
-
-                                if flat_field.data_type() == &DataType::Utf8 {
-                                    // Replace the whole expression with a direct column reference
-                                    let new_expr = Arc::new(expressions::Column::new(
-                                        &flat_column_name,
-                                        flat_field_index,
-                                    ))
-                                        as Arc<dyn PhysicalExpr>;
-
-                                    return Ok(Transformed {
-                                        data: new_expr,
-                                        tnr: TreeNodeRecursion::Stop,
-                                        transformed: true,
-                                    });
-                                }
-                            }
+                    let column_name = column.name();
+                    // Check if there's a flat column with underscore prefix
+                    let flat_column_name = format!("_{column_name}.{field_name}");
+
+                    if let Ok(flat_field_index) =
+                        physical_file_schema.index_of(&flat_column_name)
+                    {
+                        let flat_field = physical_file_schema.field(flat_field_index);
+
+                        if flat_field.data_type() == &DataType::Utf8 {
+                            // Replace the whole expression with a direct column reference
+                            let new_expr = Arc::new(expressions::Column::new(
+                                &flat_column_name,
+                                flat_field_index,
+                            ))
+                                as Arc<dyn PhysicalExpr>;
+
+                            return Ok(Transformed {
+                                data: new_expr,
+                                tnr: TreeNodeRecursion::Stop,
+                                transformed: true,
+                            });
                         }
                     }
                 }
             }
         }
diff --git a/datafusion-examples/examples/data_io/main.rs b/datafusion-examples/examples/data_io/main.rs
new file mode 100644
index 0000000000000..4656a83670aaf
--- /dev/null
+++ b/datafusion-examples/examples/data_io/main.rs
@@ -0,0 +1,152 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! # Examples of data formats and I/O
+//!
+//! These examples demonstrate working with different data formats and I/O sources.
+//!
+//! ## Usage
+//! ```bash
+//! cargo run --example data_io -- [all|catalog|in_memory_object_store|json_shredding|parquet_adv_idx|parquet_emb_idx|parquet_enc_with_kms|parquet_enc|parquet_exec_visitor|parquet_idx|query_http_csv|remote_catalog]
+//! ```
+//!
+//! Each subcommand runs a corresponding example:
+//! - `all` — run all examples included in this module
+//!
+//! - `catalog`
+//!   (file: catalog.rs, desc: Register tables into a custom catalog)
+//!
+//! - `in_memory_object_store`
+//!   (file: in_memory_object_store.rs, desc: Read CSV from an in-memory object store (pattern applies to JSON/Parquet))
+//!
+//! - `json_shredding`
+//!   (file: json_shredding.rs, desc: Implement filter rewriting for JSON shredding)
+//!
+//! - `parquet_adv_idx`
+//!   (file: parquet_advanced_index.rs, desc: Create a secondary index across multiple parquet files)
+//!
+//! - `parquet_emb_idx`
+//!   (file: parquet_embedded_index.rs, desc: Store a custom index inside Parquet files)
+//!
+//! - `parquet_enc`
+//!   (file: parquet_encrypted.rs, desc: Read & write encrypted Parquet files)
+//!
+//! - `parquet_enc_with_kms`
+//!   (file: parquet_encrypted_with_kms.rs, desc: Encrypted Parquet I/O using a KMS-backed factory)
+//!
+//! - `parquet_exec_visitor`
+//!   (file: parquet_exec_visitor.rs, desc: Extract statistics by visiting an ExecutionPlan)
+//!
+//! - `parquet_idx`
+//!   (file: parquet_index.rs, desc: Create a secondary index)
+//!
+//! - `query_http_csv`
+//!   (file: query_http_csv.rs, desc: Query CSV files via HTTP)
+//!
+//! - `remote_catalog`
+//!   (file: remote_catalog.rs, desc: Interact with a remote catalog)
+
+mod catalog;
+mod in_memory_object_store;
+mod json_shredding;
+mod parquet_advanced_index;
+mod parquet_embedded_index;
+mod parquet_encrypted;
+mod parquet_encrypted_with_kms;
+mod parquet_exec_visitor;
+mod parquet_index;
+mod query_http_csv;
+mod remote_catalog;
+
+use datafusion::error::{DataFusionError, Result};
+use strum::{IntoEnumIterator, VariantNames};
+use strum_macros::{Display, EnumIter, EnumString, VariantNames};
+
+#[derive(EnumIter, EnumString, Display, VariantNames)]
+#[strum(serialize_all = "snake_case")]
+enum ExampleKind {
+    All,
+    Catalog,
+    InMemoryObjectStore,
+    JsonShredding,
+    ParquetAdvIdx,
+    ParquetEmbIdx,
+    ParquetEnc,
+    ParquetEncWithKms,
+    ParquetExecVisitor,
+    ParquetIdx,
+    QueryHttpCsv,
+    RemoteCatalog,
+}
+
+impl ExampleKind {
+    const EXAMPLE_NAME: &str = "data_io";
+
+    fn runnable() -> impl Iterator<Item = ExampleKind> {
+        ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All))
+    }
+
+    async fn run(&self) -> Result<()> {
+        match self {
+            ExampleKind::All => {
+                for example in ExampleKind::runnable() {
+                    println!("Running example: {example}");
+                    Box::pin(example.run()).await?;
+                }
+            }
+            ExampleKind::Catalog => catalog::catalog().await?,
+            ExampleKind::InMemoryObjectStore => {
+                in_memory_object_store::in_memory_object_store().await?
+            }
+            ExampleKind::JsonShredding => json_shredding::json_shredding().await?,
+            ExampleKind::ParquetAdvIdx => {
+                parquet_advanced_index::parquet_advanced_index().await?
+            }
+            ExampleKind::ParquetEmbIdx => {
+                parquet_embedded_index::parquet_embedded_index().await?
+            }
+            ExampleKind::ParquetEncWithKms => {
+                parquet_encrypted_with_kms::parquet_encrypted_with_kms().await?
+            }
+            ExampleKind::ParquetEnc => parquet_encrypted::parquet_encrypted().await?,
+            ExampleKind::ParquetExecVisitor => {
+                parquet_exec_visitor::parquet_exec_visitor().await?
+            }
+            ExampleKind::ParquetIdx => parquet_index::parquet_index().await?,
+            ExampleKind::QueryHttpCsv => query_http_csv::query_http_csv().await?,
+            ExampleKind::RemoteCatalog => remote_catalog::remote_catalog().await?,
+        }
+        Ok(())
+    }
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let usage = format!(
+        "Usage: cargo run --example {} -- [{}]",
+        ExampleKind::EXAMPLE_NAME,
+        ExampleKind::VARIANTS.join("|")
+    );
+
+    let example: ExampleKind = std::env::args()
+        .nth(1)
+        .unwrap_or_else(|| ExampleKind::All.to_string())
+        .parse()
+        .map_err(|_| DataFusionError::Execution(format!("Unknown example.
{usage}")))?; + + example.run().await +} diff --git a/datafusion-examples/examples/advanced_parquet_index.rs b/datafusion-examples/examples/data_io/parquet_advanced_index.rs similarity index 97% rename from datafusion-examples/examples/advanced_parquet_index.rs rename to datafusion-examples/examples/data_io/parquet_advanced_index.rs index 1c560be6d08a6..9e69c7f15a841 100644 --- a/datafusion-examples/examples/advanced_parquet_index.rs +++ b/datafusion-examples/examples/data_io/parquet_advanced_index.rs @@ -15,40 +15,41 @@ // specific language governing permissions and limitations // under the License. -use std::any::Any; +//! See `main.rs` for how to run it. + use std::collections::{HashMap, HashSet}; use std::fs::File; use std::ops::Range; use std::path::{Path, PathBuf}; -use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; use datafusion::catalog::Session; use datafusion::common::{ - internal_datafusion_err, DFSchema, DataFusionError, Result, ScalarValue, + DFSchema, DataFusionError, Result, ScalarValue, internal_datafusion_err, }; +use datafusion::datasource::TableProvider; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::physical_plan::parquet::ParquetAccessPlan; use datafusion::datasource::physical_plan::{ FileScanConfigBuilder, ParquetFileReaderFactory, ParquetSource, }; -use datafusion::datasource::TableProvider; use datafusion::execution::object_store::ObjectStoreUrl; use datafusion::logical_expr::utils::conjunction; use datafusion::logical_expr::{TableProviderFilterPushDown, TableType}; +use datafusion::parquet::arrow::ArrowWriter; use datafusion::parquet::arrow::arrow_reader::{ ArrowReaderOptions, ParquetRecordBatchReaderBuilder, RowSelection, RowSelector, }; use datafusion::parquet::arrow::async_reader::{AsyncFileReader, ParquetObjectReader}; -use datafusion::parquet::arrow::ArrowWriter; -use datafusion::parquet::file::metadata::ParquetMetaData; +use datafusion::parquet::file::metadata::{PageIndexPolicy, ParquetMetaData}; use datafusion::parquet::file::properties::{EnabledStatistics, WriterProperties}; use datafusion::parquet::schema::types::ColumnPath; -use datafusion::physical_expr::utils::{Guarantee, LiteralGuarantee}; use datafusion::physical_expr::PhysicalExpr; +use datafusion::physical_expr::utils::{Guarantee, LiteralGuarantee}; use datafusion::physical_optimizer::pruning::PruningPredicate; -use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion::physical_plan::ExecutionPlan; +use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion::prelude::*; use arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray}; @@ -56,8 +57,8 @@ use arrow::datatypes::SchemaRef; use async_trait::async_trait; use bytes::Bytes; use datafusion::datasource::memory::DataSourceExec; -use futures::future::BoxFuture; use futures::FutureExt; +use futures::future::BoxFuture; use object_store::ObjectStore; use tempfile::TempDir; use url::Url; @@ -121,7 +122,6 @@ use url::Url; /// │ ╚═══════════════════╝ │ 1. With cached ParquetMetadata, so /// └───────────────────────┘ the ParquetSource does not re-read / /// Parquet File decode the thrift footer -/// /// ``` /// /// Within a Row Group, Column Chunks store data in DataPages. 
This example also
@@ -156,8 +156,7 @@ use url::Url;
 ///
 /// [`ListingTable`]: datafusion::datasource::listing::ListingTable
 /// [Page Index](https://github.com/apache/parquet-format/blob/master/PageIndex.md)
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn parquet_advanced_index() -> Result<()> {
     // the object store is used to read the parquet files (in this case, it is
     // a local file system, but in a real system it could be S3, GCS, etc)
     let object_store: Arc<dyn ObjectStore> =
@@ -240,6 +239,7 @@ pub struct IndexTableProvider {
     /// if true, use row selections in addition to row group selections
     use_row_selections: AtomicBool,
 }
+
 impl IndexTableProvider {
     /// Create a new IndexTableProvider
     /// * `object_store` - the object store implementation to use for reading files
@@ -409,7 +409,7 @@ impl IndexedFile {
         let options = ArrowReaderOptions::new()
             // Load the page index when reading metadata to cache
             // so it is available to interpret row selections
-            .with_page_index(true);
+            .with_page_index_policy(PageIndexPolicy::Required);
         let reader =
             ParquetRecordBatchReaderBuilder::try_new_with_options(file, options)?;
         let metadata = reader.metadata().clone();
@@ -450,10 +450,6 @@ impl IndexedFile {
 /// so that we can query it as a table.
 #[async_trait]
 impl TableProvider for IndexTableProvider {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
     fn schema(&self) -> SchemaRef {
         Arc::clone(&self.indexed_file.schema)
     }
@@ -492,19 +488,18 @@ impl TableProvider for IndexTableProvider {
             .with_file(indexed_file);
 
         let file_source = Arc::new(
-            ParquetSource::default()
+            ParquetSource::new(schema.clone())
                 // provide the predicate so the DataSourceExec can try and prune
                 // row groups internally
                 .with_predicate(predicate)
                 // provide the factory to create parquet reader without re-reading metadata
                 .with_parquet_file_reader_factory(Arc::new(reader_factory)),
         );
-        let file_scan_config =
-            FileScanConfigBuilder::new(object_store_url, schema, file_source)
-                .with_limit(limit)
-                .with_projection_indices(projection.cloned())
-                .with_file(partitioned_file)
-                .build();
+        let file_scan_config = FileScanConfigBuilder::new(object_store_url, file_source)
+            .with_limit(limit)
+            .with_projection_indices(projection.cloned())?
+            .with_file(partitioned_file)
+            .build();
 
         // Finally, put it all together into a DataSourceExec
         Ok(DataSourceExec::from_data_source(file_scan_config))
@@ -541,6 +536,7 @@ impl CachedParquetFileReaderFactory {
             metadata: HashMap::new(),
         }
     }
+
     /// Add the pre-parsed information about the file to the factory
     fn with_file(mut self, indexed_file: &IndexedFile) -> Self {
         self.metadata.insert(
@@ -566,7 +562,7 @@ impl ParquetFileReaderFactory for CachedParquetFileReaderFactory {
             .object_meta
             .location
             .parts()
-            .last()
+            .next_back()
             .expect("No path in location")
             .as_ref()
             .to_string();
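The `with_page_index(true)` call above is migrated to `with_page_index_policy`, replacing a boolean with an explicit policy. A hedged sketch of the new call, using exactly the import path this diff introduces; the full variant set (`Skip`, `Optional`, `Required`) is our reading of the parquet crate and worth double-checking:

```rust
use datafusion::parquet::arrow::arrow_reader::ArrowReaderOptions;
use datafusion::parquet::file::metadata::PageIndexPolicy;

fn reader_options() -> ArrowReaderOptions {
    // `Required` fails if the page index is missing, which suits this example
    // because the later row-selection logic depends on the index being present.
    ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required)
}
```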
@@ -658,7 +654,7 @@ fn make_demo_file(path: impl AsRef<Path>, value_range: Range<i32>) -> Result<()>
     // enable page statistics for the tag column,
     // for everything else.
     let props = WriterProperties::builder()
-        .set_max_row_group_size(100)
+        .set_max_row_group_row_count(Some(100))
         // compute column chunk (per row group) statistics by default
         .set_statistics_enabled(EnabledStatistics::Chunk)
         // compute column page statistics for the tag column
diff --git a/datafusion-examples/examples/parquet_embedded_index.rs b/datafusion-examples/examples/data_io/parquet_embedded_index.rs
similarity index 94%
rename from datafusion-examples/examples/parquet_embedded_index.rs
rename to datafusion-examples/examples/data_io/parquet_embedded_index.rs
index 3cbe189147752..40b5b468ff5bf 100644
--- a/datafusion-examples/examples/parquet_embedded_index.rs
+++ b/datafusion-examples/examples/data_io/parquet_embedded_index.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+//!
 //! Embedding and using a custom index in Parquet files
 //!
 //! # Background
@@ -116,11 +118,11 @@ use arrow::record_batch::RecordBatch;
 use arrow_schema::{DataType, Field, Schema, SchemaRef};
 use async_trait::async_trait;
 use datafusion::catalog::{Session, TableProvider};
-use datafusion::common::{exec_err, HashMap, HashSet, Result};
+use datafusion::common::{HashMap, HashSet, Result, exec_err};
+use datafusion::datasource::TableType;
 use datafusion::datasource::listing::PartitionedFile;
 use datafusion::datasource::memory::DataSourceExec;
 use datafusion::datasource::physical_plan::{FileScanConfigBuilder, ParquetSource};
-use datafusion::datasource::TableType;
 use datafusion::execution::object_store::ObjectStoreUrl;
 use datafusion::logical_expr::{Operator, TableProviderFilterPushDown};
 use datafusion::parquet::arrow::ArrowWriter;
@@ -130,12 +132,37 @@ use datafusion::parquet::file::reader::{FileReader, SerializedFileReader};
 use datafusion::physical_plan::ExecutionPlan;
 use datafusion::prelude::*;
 use datafusion::scalar::ScalarValue;
-use std::fs::{read_dir, File};
+use std::fs::{File, read_dir};
 use std::io::{Read, Seek, SeekFrom, Write};
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
 use tempfile::TempDir;
 
+/// Store a custom index inside a Parquet file and use it to speed up queries
+pub async fn parquet_embedded_index() -> Result<()> {
+    // 1. Create temp dir and write 3 Parquet files with different category sets
+    let tmp = TempDir::new()?;
+    let dir = tmp.path();
+    write_file_with_index(&dir.join("a.parquet"), &["foo", "bar", "foo"])?;
+    write_file_with_index(&dir.join("b.parquet"), &["baz", "qux"])?;
+    write_file_with_index(&dir.join("c.parquet"), &["foo", "quux", "quux"])?;
+
+    // 2. Register our custom TableProvider
+    let field = Field::new("category", DataType::Utf8, false);
+    let schema_ref = Arc::new(Schema::new(vec![field]));
+    let provider = Arc::new(DistinctIndexTable::try_new(dir, schema_ref.clone())?);
+
+    let ctx = SessionContext::new();
+    ctx.register_table("t", provider)?;
+
+    // 3. Run a query: only files containing 'foo' get scanned; the rest are
+    //    pruned based on the distinct index.
+ let df = ctx.sql("SELECT * FROM t WHERE category = 'foo'").await?; + df.show().await?; + + Ok(()) +} + /// An index of distinct values for a single column /// /// In this example the index is a simple set of strings, but in a real @@ -366,9 +393,6 @@ fn get_key_value<'a>(file_meta_data: &'a FileMetaData, key: &'_ str) -> Option<& /// Implement TableProvider for DistinctIndexTable, using the distinct index to prune files #[async_trait] impl TableProvider for DistinctIndexTable { - fn as_any(&self) -> &dyn std::any::Any { - self - } fn schema(&self) -> SchemaRef { self.schema.clone() } @@ -392,21 +416,15 @@ impl TableProvider for DistinctIndexTable { // equality analysis or write your own custom logic. let mut target: Option<&str> = None; - if filters.len() == 1 { - if let Expr::BinaryExpr(expr) = &filters[0] { - if expr.op == Operator::Eq { - if let ( - Expr::Column(c), - Expr::Literal(ScalarValue::Utf8(Some(v)), _), - ) = (&*expr.left, &*expr.right) - { - if c.name == "category" { - println!("Filtering for category: {v}"); - target = Some(v); - } - } - } - } + if filters.len() == 1 + && let Expr::BinaryExpr(expr) = &filters[0] + && expr.op == Operator::Eq + && let (Expr::Column(c), Expr::Literal(ScalarValue::Utf8(Some(v)), _)) = + (&*expr.left, &*expr.right) + && c.name == "category" + { + println!("Filtering for category: {v}"); + target = Some(v); } // Determine which files to scan let files_to_scan: Vec<_> = self @@ -426,8 +444,10 @@ impl TableProvider for DistinctIndexTable { // Build ParquetSource to actually read the files let url = ObjectStoreUrl::parse("file://")?; - let source = Arc::new(ParquetSource::default().with_enable_page_index(true)); - let mut builder = FileScanConfigBuilder::new(url, self.schema.clone(), source); + let source = Arc::new( + ParquetSource::new(self.schema.clone()).with_enable_page_index(true), + ); + let mut builder = FileScanConfigBuilder::new(url, source); for file in files_to_scan { let path = self.dir.join(file); let len = std::fs::metadata(&path)?.len(); @@ -450,28 +470,3 @@ impl TableProvider for DistinctIndexTable { Ok(vec![TableProviderFilterPushDown::Inexact; fs.len()]) } } - -#[tokio::main] -async fn main() -> Result<()> { - // 1. Create temp dir and write 3 Parquet files with different category sets - let tmp = TempDir::new()?; - let dir = tmp.path(); - write_file_with_index(&dir.join("a.parquet"), &["foo", "bar", "foo"])?; - write_file_with_index(&dir.join("b.parquet"), &["baz", "qux"])?; - write_file_with_index(&dir.join("c.parquet"), &["foo", "quux", "quux"])?; - - // 2. Register our custom TableProvider - let field = Field::new("category", DataType::Utf8, false); - let schema_ref = Arc::new(Schema::new(vec![field])); - let provider = Arc::new(DistinctIndexTable::try_new(dir, schema_ref.clone())?); - - let ctx = SessionContext::new(); - ctx.register_table("t", provider)?; - - // 3. Run a query: only files containing 'foo' get scanned. The rest are pruned. - // based on the distinct index. 
- let df = ctx.sql("SELECT * FROM t WHERE category = 'foo'").await?; - df.show().await?; - - Ok(()) -} diff --git a/datafusion-examples/examples/parquet_encrypted.rs b/datafusion-examples/examples/data_io/parquet_encrypted.rs similarity index 75% rename from datafusion-examples/examples/parquet_encrypted.rs rename to datafusion-examples/examples/data_io/parquet_encrypted.rs index 690d9f2a5f140..f73c538d1c4d9 100644 --- a/datafusion-examples/examples/parquet_encrypted.rs +++ b/datafusion-examples/examples/data_io/parquet_encrypted.rs @@ -15,6 +15,10 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. + +use std::sync::Arc; + use datafusion::common::DataFusionError; use datafusion::config::{ConfigFileEncryptionProperties, TableParquetOptions}; use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; @@ -22,21 +26,21 @@ use datafusion::logical_expr::{col, lit}; use datafusion::parquet::encryption::decrypt::FileDecryptionProperties; use datafusion::parquet::encryption::encrypt::FileEncryptionProperties; use datafusion::prelude::{ParquetReadOptions, SessionContext}; -use std::sync::Arc; +use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet}; use tempfile::TempDir; -#[tokio::main] -async fn main() -> datafusion::common::Result<()> { +/// Read and write encrypted Parquet files using DataFusion +pub async fn parquet_encrypted() -> datafusion::common::Result<()> { // The SessionContext is the main high level API for interacting with DataFusion let ctx = SessionContext::new(); - // Find the local path of "alltypes_plain.parquet" - let testdata = datafusion::test_util::parquet_test_data(); - let filename = &format!("{testdata}/alltypes_plain.parquet"); + // Convert the CSV input into a temporary Parquet directory for querying + let dataset = ExampleDataset::Cars; + let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?; // Read the sample parquet file let parquet_df = ctx - .read_parquet(filename, ParquetReadOptions::default()) + .read_parquet(parquet_temp.path_str()?, ParquetReadOptions::default()) .await?; // Show information from the dataframe @@ -50,30 +54,33 @@ async fn main() -> datafusion::common::Result<()> { let (encrypt, decrypt) = setup_encryption(&parquet_df)?; // Create a temporary file location for the encrypted parquet file - let tmp_dir = TempDir::new()?; - let tempfile = tmp_dir.path().join("alltypes_plain-encrypted.parquet"); - let tempfile_str = tempfile.into_os_string().into_string().unwrap(); + let tmp_source = TempDir::new()?; + let tempfile = tmp_source.path().join("cars_encrypted.parquet"); // Write encrypted parquet let mut options = TableParquetOptions::default(); options.crypto.file_encryption = Some(ConfigFileEncryptionProperties::from(&encrypt)); parquet_df .write_parquet( - tempfile_str.as_str(), + tempfile.to_str().unwrap(), DataFrameWriteOptions::new().with_single_file_output(true), Some(options), ) .await?; - // Read encrypted parquet + // Read encrypted parquet back as a DataFrame using matching decryption config let ctx: SessionContext = SessionContext::new(); let read_options = - ParquetReadOptions::default().file_decryption_properties((&decrypt).into()); + ParquetReadOptions::default().file_decryption_properties((&decrypt).try_into()?); - let encrypted_parquet_df = ctx.read_parquet(tempfile_str, read_options).await?; + let encrypted_parquet_df = ctx + .read_parquet(tempfile.to_str().unwrap(), read_options) + .await?; // Show information from the 
dataframe - println!("\n\n==============================================================================="); + println!( + "\n\n===============================================================================" + ); println!("Encrypted Parquet DataFrame:"); query_dataframe(&encrypted_parquet_df).await?; @@ -87,11 +94,12 @@ async fn query_dataframe(df: &DataFrame) -> Result<(), DataFusionError> { df.clone().describe().await?.show().await?; // Select three columns and filter the results - // so that only rows where id > 1 are returned + // so that only rows where speed > 5 are returned + // select car, speed, time from t where speed > 5 println!("\nSelected rows and columns:"); df.clone() - .select_columns(&["id", "bool_col", "timestamp_col"])? - .filter(col("id").gt(lit(5)))? + .select_columns(&["car", "speed", "time"])? + .filter(col("speed").gt(lit(5)))? .show() .await?; diff --git a/datafusion-examples/examples/parquet_encrypted_with_kms.rs b/datafusion-examples/examples/data_io/parquet_encrypted_with_kms.rs similarity index 99% rename from datafusion-examples/examples/parquet_encrypted_with_kms.rs rename to datafusion-examples/examples/data_io/parquet_encrypted_with_kms.rs index 45bfd183773a0..1a9bf56c09b35 100644 --- a/datafusion-examples/examples/parquet_encrypted_with_kms.rs +++ b/datafusion-examples/examples/data_io/parquet_encrypted_with_kms.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. + use arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray}; use arrow_schema::SchemaRef; use async_trait::async_trait; @@ -53,8 +55,7 @@ const ENCRYPTION_FACTORY_ID: &str = "example.mock_kms_encryption"; /// which is not a secure way to store encryption keys. /// For production use, it is recommended to use a key-management service (KMS) to encrypt /// data encryption keys. -#[tokio::main] -async fn main() -> Result<()> { +pub async fn parquet_encrypted_with_kms() -> Result<()> { let ctx = SessionContext::new(); // Register an `EncryptionFactory` implementation to be used for Parquet encryption diff --git a/datafusion-examples/examples/parquet_exec_visitor.rs b/datafusion-examples/examples/data_io/parquet_exec_visitor.rs similarity index 72% rename from datafusion-examples/examples/parquet_exec_visitor.rs rename to datafusion-examples/examples/data_io/parquet_exec_visitor.rs index 84f92d4f450e1..d1951b2d9904d 100644 --- a/datafusion-examples/examples/parquet_exec_visitor.rs +++ b/datafusion-examples/examples/data_io/parquet_exec_visitor.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. 
+
 use std::sync::Arc;
 
 use datafusion::datasource::file_format::parquet::ParquetFormat;
@@ -25,34 +27,37 @@ use datafusion::error::DataFusionError;
 use datafusion::execution::context::SessionContext;
 use datafusion::physical_plan::metrics::MetricValue;
 use datafusion::physical_plan::{
-    execute_stream, visit_execution_plan, ExecutionPlan, ExecutionPlanVisitor,
+    ExecutionPlan, ExecutionPlanVisitor, execute_stream, visit_execution_plan,
 };
+use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet};
 use futures::StreamExt;
 
 /// Example of collecting metrics after execution by visiting the `ExecutionPlan`
-#[tokio::main]
-async fn main() {
+pub async fn parquet_exec_visitor() -> datafusion::common::Result<()> {
     let ctx = SessionContext::new();
 
-    let test_data = datafusion::test_util::parquet_test_data();
+    // Convert the CSV input into a temporary Parquet directory for querying
+    let dataset = ExampleDataset::Cars;
+    let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?;
 
     // Configure listing options
     let file_format = ParquetFormat::default().with_enable_pruning(true);
     let listing_options = ListingOptions::new(Arc::new(file_format));
 
+    let table_path = parquet_temp.file_uri()?;
+
     // First example where we use an absolute path, which requires no additional setup.
-    let _ = ctx
-        .register_listing_table(
-            "my_table",
-            &format!("file://{test_data}/alltypes_plain.parquet"),
-            listing_options.clone(),
-            None,
-            None,
-        )
-        .await;
-
-    let df = ctx.sql("SELECT * FROM my_table").await.unwrap();
-    let plan = df.create_physical_plan().await.unwrap();
+    ctx.register_listing_table(
+        "my_table",
+        &table_path,
+        listing_options.clone(),
+        None,
+        None,
+    )
+    .await?;
+
+    let df = ctx.sql("SELECT * FROM my_table").await?;
+    let plan = df.create_physical_plan().await?;
 
     // Create empty visitor
     let mut visitor = ParquetExecVisitor {
@@ -63,12 +68,12 @@
     // Make sure you execute the plan to collect actual execution statistics.
     // For example, here the `file_scan_config` is known without executing,
    // but the `bytes_scanned` would be None if we did not execute.
-    let mut batch_stream = execute_stream(plan.clone(), ctx.task_ctx()).unwrap();
+    let mut batch_stream = execute_stream(plan.clone(), ctx.task_ctx())?;
     while let Some(batch) = batch_stream.next().await {
         println!("Batch rows: {}", batch.unwrap().num_rows());
     }
 
-    visit_execution_plan(plan.as_ref(), &mut visitor).unwrap();
+    visit_execution_plan(plan.as_ref(), &mut visitor)?;
 
     println!(
         "ParquetExecVisitor bytes_scanned: {:?}",
@@ -78,6 +83,8 @@
         "ParquetExecVisitor file_groups: {:?}",
         visitor.file_groups.unwrap()
    );
+
+    Ok(())
 }
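Condensed from the function above, since the ordering is the load-bearing detail: metrics such as `bytes_scanned` are only recorded while the stream is actually driven. A minimal sketch of the execute-then-visit pattern, with the visitor step elided:

```rust
use std::sync::Arc;

use datafusion::error::Result;
use datafusion::physical_plan::{execute_stream, ExecutionPlan};
use datafusion::prelude::SessionContext;
use futures::StreamExt;

async fn run_then_inspect(ctx: &SessionContext, plan: Arc<dyn ExecutionPlan>) -> Result<()> {
    // 1. Drive the stream to completion; scan metrics accumulate as batches flow.
    let mut stream = execute_stream(Arc::clone(&plan), ctx.task_ctx())?;
    while let Some(batch) = stream.next().await {
        batch?; // propagate execution errors
    }
    // 2. Only now will visit_execution_plan observe populated metrics
    //    such as `bytes_scanned` on the scan nodes.
    Ok(())
}
```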
 /// Define a struct with fields to hold the execution information you want to
@@ -97,18 +104,17 @@ impl ExecutionPlanVisitor for ParquetExecVisitor {
     /// or `post_visit` (visit each node after its children/inputs)
     fn pre_visit(&mut self, plan: &dyn ExecutionPlan) -> Result<bool, Self::Error> {
         // If needed match on a specific `ExecutionPlan` node type
-        if let Some(data_source_exec) = plan.as_any().downcast_ref::<DataSourceExec>() {
-            if let Some((file_config, _)) =
+        if let Some(data_source_exec) = plan.downcast_ref::<DataSourceExec>()
+            && let Some((file_config, _)) =
                 data_source_exec.downcast_to_file_source::<ParquetSource>()
-            {
-                self.file_groups = Some(file_config.file_groups.clone());
-
-                let metrics = match data_source_exec.metrics() {
-                    None => return Ok(true),
-                    Some(metrics) => metrics,
-                };
-                self.bytes_scanned = metrics.sum_by_name("bytes_scanned");
-            }
+        {
+            self.file_groups = Some(file_config.file_groups.clone());
+
+            let metrics = match data_source_exec.metrics() {
+                None => return Ok(true),
+                Some(metrics) => metrics,
+            };
+            self.bytes_scanned = metrics.sum_by_name("bytes_scanned");
         }
         Ok(true)
     }
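Several hunks in this diff (in `parquet_advanced_index.rs` above and `parquet_index.rs` below) migrate to the builder shape where the file schema travels with the source rather than with `FileScanConfigBuilder`. A small sketch of the new wiring, built only from calls that appear in this diff:

```rust
use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Schema};
use datafusion::datasource::physical_plan::{FileScanConfigBuilder, ParquetSource};
use datafusion::error::Result;
use datafusion::execution::object_store::ObjectStoreUrl;

fn build_scan_config() -> Result<()> {
    let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)]));
    // The schema is now handed to the source...
    let source = Arc::new(ParquetSource::new(schema));
    // ...so the builder takes just the object store URL and the source.
    let _builder = FileScanConfigBuilder::new(ObjectStoreUrl::parse("file://")?, source);
    Ok(())
}
```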
diff --git a/datafusion-examples/examples/parquet_index.rs b/datafusion-examples/examples/data_io/parquet_index.rs
similarity index 97%
rename from datafusion-examples/examples/parquet_index.rs
rename to datafusion-examples/examples/data_io/parquet_index.rs
index 127c55da982c8..9be84d8249342 100644
--- a/datafusion-examples/examples/parquet_index.rs
+++ b/datafusion-examples/examples/data_io/parquet_index.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use arrow::array::{
     Array, ArrayRef, AsArray, BooleanArray, Int32Array, RecordBatch, StringArray,
     UInt64Array,
@@ -25,33 +27,32 @@ use async_trait::async_trait;
 use datafusion::catalog::Session;
 use datafusion::common::pruning::PruningStatistics;
 use datafusion::common::{
-    internal_datafusion_err, DFSchema, DataFusionError, Result, ScalarValue,
+    DFSchema, DataFusionError, Result, ScalarValue, internal_datafusion_err,
 };
+use datafusion::datasource::TableProvider;
 use datafusion::datasource::listing::PartitionedFile;
 use datafusion::datasource::memory::DataSourceExec;
 use datafusion::datasource::physical_plan::{FileScanConfigBuilder, ParquetSource};
-use datafusion::datasource::TableProvider;
 use datafusion::execution::object_store::ObjectStoreUrl;
 use datafusion::logical_expr::{
-    utils::conjunction, TableProviderFilterPushDown, TableType,
+    TableProviderFilterPushDown, TableType, utils::conjunction,
 };
 use datafusion::parquet::arrow::arrow_reader::statistics::StatisticsConverter;
 use datafusion::parquet::arrow::{
-    arrow_reader::ParquetRecordBatchReaderBuilder, ArrowWriter,
+    ArrowWriter, arrow_reader::ParquetRecordBatchReaderBuilder,
 };
 use datafusion::physical_expr::PhysicalExpr;
 use datafusion::physical_optimizer::pruning::PruningPredicate;
 use datafusion::physical_plan::ExecutionPlan;
 use datafusion::prelude::*;
-use std::any::Any;
 use std::collections::HashSet;
 use std::fmt::Display;
 use std::fs;
 use std::fs::{DirEntry, File};
 use std::ops::Range;
 use std::path::{Path, PathBuf};
-use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::Arc;
+use std::sync::atomic::{AtomicUsize, Ordering};
 use tempfile::TempDir;
 use url::Url;
@@ -99,12 +100,10 @@ use url::Url;
 /// Thus some parquet files are  │               │
 /// "pruned" and thus are not    └─────────────┘
 /// scanned at all               Parquet Files
-///
 /// ```
 ///
 /// [`ListingTable`]: datafusion::datasource::listing::ListingTable
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn parquet_index() -> Result<()> {
     // Demo data has three files, each with schema
     // * file_name (string)
     // * value (int32)
@@ -208,10 +207,6 @@ impl IndexTableProvider {
 
 #[async_trait]
 impl TableProvider for IndexTableProvider {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
     fn schema(&self) -> SchemaRef {
         self.index.schema().clone()
     }
@@ -243,10 +238,11 @@ impl TableProvider for IndexTableProvider {
         let files = self.index.get_files(predicate.clone())?;
 
         let object_store_url = ObjectStoreUrl::parse("file://")?;
-        let source = Arc::new(ParquetSource::default().with_predicate(predicate));
+        let source =
+            Arc::new(ParquetSource::new(self.schema()).with_predicate(predicate));
         let mut file_scan_config_builder =
-            FileScanConfigBuilder::new(object_store_url, self.schema(), source)
-                .with_projection_indices(projection.cloned())
+            FileScanConfigBuilder::new(object_store_url, source)
+                .with_projection_indices(projection.cloned())?
                 .with_limit(limit);
 
         // Transform to the format needed to pass to DataSourceExec
@@ -461,7 +457,7 @@ impl PruningStatistics for ParquetMetadataIndex {
     }
 
     /// return the row counts for each file
-    fn row_counts(&self, _column: &Column) -> Option<UInt64Array> {
+    fn row_counts(&self) -> Option<UInt64Array> {
         Some(self.row_counts_ref().clone())
     }
@@ -510,7 +506,7 @@ impl ParquetMetadataIndexBuilder {
 
         // Get the schema of the file. A real system might have to handle the
         // case where the schema of the file is not the same as the schema of
-        // the other files e.g. using SchemaAdapter.
+        // the other files e.g. using PhysicalExprAdapterFactory.
if self.file_schema.is_none() { self.file_schema = Some(reader.schema().clone()); } diff --git a/datafusion-examples/examples/query-http-csv.rs b/datafusion-examples/examples/data_io/query_http_csv.rs similarity index 91% rename from datafusion-examples/examples/query-http-csv.rs rename to datafusion-examples/examples/data_io/query_http_csv.rs index fa3fd2ac068df..71421e6270ccb 100644 --- a/datafusion-examples/examples/query-http-csv.rs +++ b/datafusion-examples/examples/data_io/query_http_csv.rs @@ -15,16 +15,16 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. + use datafusion::error::Result; use datafusion::prelude::*; use object_store::http::HttpBuilder; use std::sync::Arc; use url::Url; -/// This example demonstrates executing a simple query against an Arrow data source (CSV) and -/// fetching results -#[tokio::main] -async fn main() -> Result<()> { +/// Configure `object_store` and run a query against files via HTTP +pub async fn query_http_csv() -> Result<()> { // create local execution context let ctx = SessionContext::new(); diff --git a/datafusion-examples/examples/remote_catalog.rs b/datafusion-examples/examples/data_io/remote_catalog.rs similarity index 98% rename from datafusion-examples/examples/remote_catalog.rs rename to datafusion-examples/examples/data_io/remote_catalog.rs index 74575554ec0af..16814752b3ec2 100644 --- a/datafusion-examples/examples/remote_catalog.rs +++ b/datafusion-examples/examples/data_io/remote_catalog.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. +//! /// This example shows how to implement the DataFusion [`CatalogProvider`] API /// for catalogs that are remote (require network access) and/or offer only /// asynchronous APIs such as [Polaris], [Unity], and [Hive]. @@ -39,15 +41,14 @@ use datafusion::common::{assert_batches_eq, internal_datafusion_err, plan_err}; use datafusion::datasource::memory::MemorySourceConfig; use datafusion::execution::SendableRecordBatchStream; use datafusion::logical_expr::{Expr, TableType}; -use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::ExecutionPlan; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::prelude::{DataFrame, SessionContext}; use futures::TryStreamExt; -use std::any::Any; use std::sync::Arc; -#[tokio::main] -async fn main() -> Result<()> { +/// Interfacing with a remote catalog (e.g. over a network) +pub async fn remote_catalog() -> Result<()> { // As always, we create a session context to interact with DataFusion let ctx = SessionContext::new(); @@ -222,10 +223,6 @@ impl RemoteTable { /// Implement the DataFusion Catalog API for [`RemoteTable`] #[async_trait] impl TableProvider for RemoteTable { - fn as_any(&self) -> &dyn Any { - self - } - fn schema(&self) -> SchemaRef { self.schema.clone() } diff --git a/datafusion-examples/examples/dataframe/cache_factory.rs b/datafusion-examples/examples/dataframe/cache_factory.rs new file mode 100644 index 0000000000000..a92c3dc4ce26a --- /dev/null +++ b/datafusion-examples/examples/dataframe/cache_factory.rs @@ -0,0 +1,229 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! See `main.rs` for how to run it.
+
+use std::fmt::Debug;
+use std::hash::Hash;
+use std::sync::{Arc, RwLock};
+
+use arrow::array::RecordBatch;
+use async_trait::async_trait;
+use datafusion::catalog::memory::MemorySourceConfig;
+use datafusion::common::DFSchemaRef;
+use datafusion::error::Result;
+use datafusion::execution::context::QueryPlanner;
+use datafusion::execution::session_state::CacheFactory;
+use datafusion::execution::{SessionState, SessionStateBuilder};
+use datafusion::logical_expr::{
+    Extension, LogicalPlan, UserDefinedLogicalNode, UserDefinedLogicalNodeCore,
+};
+use datafusion::physical_plan::{ExecutionPlan, collect_partitioned};
+use datafusion::physical_planner::{
+    DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner,
+};
+use datafusion::prelude::*;
+use datafusion_common::HashMap;
+use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet};
+
+/// This example demonstrates how to leverage [CacheFactory] to implement custom caching strategies for dataframes in DataFusion.
+/// By default, [DataFrame::cache] in DataFusion is eager and creates an in-memory table. This example shows a basic alternative implementation for lazy caching.
+/// Specifically, it implements:
+/// - A [CustomCacheFactory] that creates a logical node [CacheNode] representing the cache operation.
+/// - A [CacheNodePlanner] (an [ExtensionPlanner]) that understands [CacheNode] and performs caching.
+/// - A [CacheNodeQueryPlanner] that installs [CacheNodePlanner].
+/// - A simple in-memory [CacheManager] that stores cached [RecordBatch]es. Note that the implementation for this example is very naive and only implements put, but for real production use cases cache eviction and drop should also be implemented.
+pub async fn cache_dataframe_with_custom_logic() -> Result<()> {
+    let session_state = SessionStateBuilder::new()
+        .with_cache_factory(Some(Arc::new(CustomCacheFactory {})))
+        .with_query_planner(Arc::new(CacheNodeQueryPlanner::default()))
+        .build();
+    let ctx = SessionContext::new_with_state(session_state);
+
+    // Convert the CSV input into a temporary Parquet directory for querying
+    let dataset = ExampleDataset::Cars;
+    let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?;
+
+    // Read the parquet files and show its schema using 'describe'
+    let parquet_df = ctx
+        .read_parquet(parquet_temp.path_str()?, ParquetReadOptions::default())
+        .await?;
+
+    let df_cached = parquet_df
+        .select_columns(&["car", "speed", "time"])?
+        .filter(col("speed").gt(lit(1.0)))?
+        .cache()
+        .await?;
+
+    let df1 = df_cached.clone().filter(col("car").eq(lit("red")))?;
+    let df2 = df1.clone().sort(vec![col("car").sort(true, false)])?;
+
+    // should see log for caching only once
+    df_cached.show().await?;
+    df1.show().await?;
+    df2.show().await?;
+
+    Ok(())
+}
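For contrast with the lazy strategy implemented below: a minimal sketch of DataFusion's default, eager `DataFrame::cache`, which (as the doc comment above states) runs the plan immediately and materializes the result in an in-memory table:

```rust
use datafusion::error::Result;
use datafusion::prelude::*;

async fn default_eager_cache() -> Result<()> {
    let ctx = SessionContext::new();
    let df = ctx.sql("SELECT 1 AS x").await?;
    // Default behavior: executes the query now and stores the batches
    // in an in-memory table backing the returned DataFrame.
    let cached = df.cache().await?;
    cached.show().await?; // served from memory, no recomputation
    Ok(())
}
```

The `CacheFactory` approach below instead defers that work to physical planning, so the cache is only populated the first time a plan containing the `CacheNode` actually executes.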
+#[derive(Debug)]
+struct CustomCacheFactory {}
+
+impl CacheFactory for CustomCacheFactory {
+    fn create(
+        &self,
+        plan: LogicalPlan,
+        _session_state: &SessionState,
+    ) -> Result<LogicalPlan> {
+        Ok(LogicalPlan::Extension(Extension {
+            node: Arc::new(CacheNode { input: plan }),
+        }))
+    }
+}
+
+#[derive(PartialEq, Eq, PartialOrd, Hash, Debug)]
+struct CacheNode {
+    input: LogicalPlan,
+}
+
+impl UserDefinedLogicalNodeCore for CacheNode {
+    fn name(&self) -> &str {
+        "CacheNode"
+    }
+
+    fn inputs(&self) -> Vec<&LogicalPlan> {
+        vec![&self.input]
+    }
+
+    fn schema(&self) -> &DFSchemaRef {
+        self.input.schema()
+    }
+
+    fn expressions(&self) -> Vec<Expr> {
+        vec![]
+    }
+
+    fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "CacheNode")
+    }
+
+    fn with_exprs_and_inputs(
+        &self,
+        _exprs: Vec<Expr>,
+        mut inputs: Vec<LogicalPlan>,
+    ) -> Result<Self> {
+        assert_eq!(inputs.len(), 1, "input size must be one");
+        Ok(Self {
+            input: inputs.swap_remove(0),
+        })
+    }
+}
+
+struct CacheNodePlanner {
+    cache_manager: Arc<RwLock<CacheManager>>,
+}
+
+#[async_trait]
+impl ExtensionPlanner for CacheNodePlanner {
+    async fn plan_extension(
+        &self,
+        _planner: &dyn PhysicalPlanner,
+        node: &dyn UserDefinedLogicalNode,
+        logical_inputs: &[&LogicalPlan],
+        physical_inputs: &[Arc<dyn ExecutionPlan>],
+        session_state: &SessionState,
+    ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
+        if let Some(cache_node) = node.as_any().downcast_ref::<CacheNode>() {
+            assert_eq!(logical_inputs.len(), 1, "Inconsistent number of inputs");
+            assert_eq!(physical_inputs.len(), 1, "Inconsistent number of inputs");
+            if self
+                .cache_manager
+                .read()
+                .unwrap()
+                .get(&cache_node.input)
+                .is_none()
+            {
+                let ctx = session_state.task_ctx();
+                println!("caching in memory");
+                let batches =
+                    collect_partitioned(physical_inputs[0].clone(), ctx).await?;
+                self.cache_manager
+                    .write()
+                    .unwrap()
+                    .put(cache_node.input.clone(), batches);
+            } else {
+                println!("fetching directly from cache manager");
+            }
+            Ok(self
+                .cache_manager
+                .read()
+                .unwrap()
+                .get(&cache_node.input)
+                .map(|batches| {
+                    let exec: Arc<dyn ExecutionPlan> = MemorySourceConfig::try_new_exec(
+                        batches,
+                        physical_inputs[0].schema(),
+                        None,
+                    )
+                    .unwrap();
+                    exec
+                }))
+        } else {
+            Ok(None)
+        }
+    }
+}
+
+#[derive(Debug, Default)]
+struct CacheNodeQueryPlanner {
+    cache_manager: Arc<RwLock<CacheManager>>,
+}
+
+#[async_trait]
+impl QueryPlanner for CacheNodeQueryPlanner {
+    async fn create_physical_plan(
+        &self,
+        logical_plan: &LogicalPlan,
+        session_state: &SessionState,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        let physical_planner =
+            DefaultPhysicalPlanner::with_extension_planners(vec![Arc::new(
+                CacheNodePlanner {
+                    cache_manager: Arc::clone(&self.cache_manager),
+                },
+            )]);
+        physical_planner
+            .create_physical_plan(logical_plan, session_state)
+            .await
+    }
+}
+
+// This naive implementation only includes put, but for real production use cases cache eviction and drop should also be implemented.
+// This naive implementation only implements `put`; a production cache should
+// also support eviction and drop.
+#[derive(Debug, Default)]
+struct CacheManager {
+    cache: HashMap<LogicalPlan, Vec<Vec<RecordBatch>>>,
+}
+
+impl CacheManager {
+    pub fn put(&mut self, k: LogicalPlan, v: Vec<Vec<RecordBatch>>) {
+        self.cache.insert(k, v);
+    }
+
+    pub fn get(&self, k: &LogicalPlan) -> Option<&Vec<Vec<RecordBatch>>> {
+        self.cache.get(k)
+    }
+}
diff --git a/datafusion-examples/examples/dataframe.rs b/datafusion-examples/examples/dataframe/dataframe.rs
similarity index 73%
rename from datafusion-examples/examples/dataframe.rs
rename to datafusion-examples/examples/dataframe/dataframe.rs
index a5ee571a14764..dde19cb476f14 100644
--- a/datafusion-examples/examples/dataframe.rs
+++ b/datafusion-examples/examples/dataframe/dataframe.rs
@@ -15,22 +15,26 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
+use std::fs::File;
+use std::io::Write;
+use std::sync::Arc;
+
 use arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray, StringViewArray};
 use datafusion::arrow::datatypes::{DataType, Field, Schema};
 use datafusion::catalog::MemTable;
+use datafusion::common::ScalarValue;
 use datafusion::common::config::CsvOptions;
 use datafusion::common::parsers::CompressionTypeVariant;
-use datafusion::common::DataFusionError;
-use datafusion::common::ScalarValue;
 use datafusion::dataframe::DataFrameWriteOptions;
 use datafusion::error::Result;
 use datafusion::functions_aggregate::average::avg;
 use datafusion::functions_aggregate::min_max::max;
 use datafusion::prelude::*;
-use std::fs::File;
-use std::io::Write;
-use std::sync::Arc;
-use tempfile::tempdir;
+use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet};
+use tempfile::{TempDir, tempdir};
+use tokio::fs::create_dir_all;
 
 /// This example demonstrates using DataFusion's DataFrame API
 ///
@@ -39,6 +43,7 @@ use tempfile::tempdir;
 /// * [read_parquet]: execute queries against parquet files
 /// * [read_csv]: execute queries against csv files
 /// * [read_memory]: execute queries against in-memory arrow data
+/// * [read_memory_macro]: execute queries against in-memory arrow data using a macro
 ///
 /// # Writing out to local storage
 ///
@@ -53,12 +58,7 @@
 /// * [where_scalar_subquery]: execute a scalar subquery
 /// * [where_in_subquery]: execute a subquery with an IN clause
 /// * [where_exist_subquery]: execute a subquery with an EXISTS clause
-///
-/// # Querying data
-///
-/// * [query_to_date]: execute queries against parquet files
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn dataframe_example() -> Result<()> {
     env_logger::init();
     // The SessionContext is the main high level API for interacting with DataFusion
     let ctx = SessionContext::new();
@@ -67,8 +67,8 @@
     read_memory(&ctx).await?;
     read_memory_macro().await?;
     write_out(&ctx).await?;
-    register_aggregate_test_data("t1", &ctx).await?;
-    register_aggregate_test_data("t2", &ctx).await?;
+    register_cars_test_data("t1", &ctx).await?;
+    register_cars_test_data("t2", &ctx).await?;
     where_scalar_subquery(&ctx).await?;
     where_in_subquery(&ctx).await?;
     where_exist_subquery(&ctx).await?;
@@ -80,23 +80,24 @@
 /// 2. Show the schema
 /// 3.
Select columns and rows async fn read_parquet(ctx: &SessionContext) -> Result<()> { - // Find the local path of "alltypes_plain.parquet" - let testdata = datafusion::test_util::parquet_test_data(); - let filename = &format!("{testdata}/alltypes_plain.parquet"); + // Convert the CSV input into a temporary Parquet directory for querying + let dataset = ExampleDataset::Cars; + let parquet_temp = write_csv_to_parquet(ctx, &dataset.path()).await?; // Read the parquet files and show its schema using 'describe' let parquet_df = ctx - .read_parquet(filename, ParquetReadOptions::default()) + .read_parquet(parquet_temp.path_str()?, ParquetReadOptions::default()) .await?; // show its schema using 'describe' parquet_df.clone().describe().await?.show().await?; // Select three columns and filter the results - // so that only rows where id > 1 are returned + // so that only rows where speed > 1 are returned + // select car, speed, time from t where speed > 1 parquet_df - .select_columns(&["id", "bool_col", "timestamp_col"])? - .filter(col("id").gt(lit(1)))? + .select_columns(&["car", "speed", "time"])? + .filter(col("speed").gt(lit(1)))? .show() .await?; @@ -199,7 +200,7 @@ async fn read_memory_macro() -> Result<()> { /// 2. Write out a DataFrame to a parquet file /// 3. Write out a DataFrame to a csv file /// 4. Write out a DataFrame to a json file -async fn write_out(ctx: &SessionContext) -> std::result::Result<(), DataFusionError> { +async fn write_out(ctx: &SessionContext) -> Result<()> { let array = StringViewArray::from(vec!["a", "b", "c"]); let schema = Arc::new(Schema::new(vec![Field::new( "tablecol1", @@ -211,15 +212,26 @@ async fn write_out(ctx: &SessionContext) -> std::result::Result<(), DataFusionEr ctx.register_table("initial_data", Arc::new(mem_table))?; let df = ctx.table("initial_data").await?; - ctx.sql( - "create external table - test(tablecol1 varchar) - stored as parquet - location './datafusion-examples/test_table/'", - ) - .await? - .collect() - .await?; + // Create a single temp root with subdirectories + let tmp_root = TempDir::new()?; + let examples_root = tmp_root.path().join("datafusion-examples"); + create_dir_all(&examples_root).await?; + let table_dir = examples_root.join("test_table"); + let parquet_dir = examples_root.join("test_parquet"); + let csv_dir = examples_root.join("test_csv"); + let json_dir = examples_root.join("test_json"); + create_dir_all(&table_dir).await?; + create_dir_all(&parquet_dir).await?; + create_dir_all(&csv_dir).await?; + create_dir_all(&json_dir).await?; + + let create_sql = format!( + "CREATE EXTERNAL TABLE test(tablecol1 varchar) + STORED AS parquet + LOCATION '{}'", + table_dir.display() + ); + ctx.sql(&create_sql).await?.collect().await?; // This is equivalent to INSERT INTO test VALUES ('a'), ('b'), ('c'). 
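    // (Illustration, based on the surrounding comments rather than shown in
    // this hunk: the `write_table` call that follows in the unchanged code is
    // presumably
    //     df.clone().write_table("test", DataFrameWriteOptions::new()).await?;
    // which hands the batches to the registered provider's insert_into path.)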
// The behavior of write_table depends on the TableProvider's implementation @@ -230,7 +242,7 @@ async fn write_out(ctx: &SessionContext) -> std::result::Result<(), DataFusionEr df.clone() .write_parquet( - "./datafusion-examples/test_parquet/", + parquet_dir.to_str().unwrap(), DataFrameWriteOptions::new(), None, ) @@ -238,7 +250,7 @@ async fn write_out(ctx: &SessionContext) -> std::result::Result<(), DataFusionEr df.clone() .write_csv( - "./datafusion-examples/test_csv/", + csv_dir.to_str().unwrap(), // DataFrameWriteOptions contains options which control how data is written // such as compression codec DataFrameWriteOptions::new(), @@ -248,7 +260,7 @@ async fn write_out(ctx: &SessionContext) -> std::result::Result<(), DataFusionEr df.clone() .write_json( - "./datafusion-examples/test_json/", + json_dir.to_str().unwrap(), DataFrameWriteOptions::new(), None, ) @@ -258,7 +270,7 @@ async fn write_out(ctx: &SessionContext) -> std::result::Result<(), DataFusionEr } /// Use the DataFrame API to execute the following subquery: -/// select c1,c2 from t1 where (select avg(t2.c2) from t2 where t1.c1 = t2.c1)>0 limit 3; +/// select car, speed from t1 where (select avg(t2.speed) from t2 where t1.car = t2.car) > 0 limit 3; async fn where_scalar_subquery(ctx: &SessionContext) -> Result<()> { ctx.table("t1") .await? @@ -266,14 +278,14 @@ async fn where_scalar_subquery(ctx: &SessionContext) -> Result<()> { scalar_subquery(Arc::new( ctx.table("t2") .await? - .filter(out_ref_col(DataType::Utf8, "t1.c1").eq(col("t2.c1")))? - .aggregate(vec![], vec![avg(col("t2.c2"))])? - .select(vec![avg(col("t2.c2"))])? + .filter(out_ref_col(DataType::Utf8, "t1.car").eq(col("t2.car")))? + .aggregate(vec![], vec![avg(col("t2.speed"))])? + .select(vec![avg(col("t2.speed"))])? .into_unoptimized_plan(), )) - .gt(lit(0u8)), + .gt(lit(0.0)), )? - .select(vec![col("t1.c1"), col("t1.c2")])? + .select(vec![col("t1.car"), col("t1.speed")])? .limit(0, Some(3))? .show() .await?; @@ -281,22 +293,24 @@ async fn where_scalar_subquery(ctx: &SessionContext) -> Result<()> { } /// Use the DataFrame API to execute the following subquery: -/// select t1.c1, t1.c2 from t1 where t1.c2 in (select max(t2.c2) from t2 where t2.c1 > 0 ) limit 3; +/// select t1.car, t1.speed from t1 where t1.speed in (select max(t2.speed) from t2 where t2.car = 'red') limit 3; async fn where_in_subquery(ctx: &SessionContext) -> Result<()> { ctx.table("t1") .await? .filter(in_subquery( - col("t1.c2"), + col("t1.speed"), Arc::new( ctx.table("t2") .await? - .filter(col("t2.c1").gt(lit(ScalarValue::UInt8(Some(0)))))? - .aggregate(vec![], vec![max(col("t2.c2"))])? - .select(vec![max(col("t2.c2"))])? + .filter( + col("t2.car").eq(lit(ScalarValue::Utf8(Some("red".to_string())))), + )? + .aggregate(vec![], vec![max(col("t2.speed"))])? + .select(vec![max(col("t2.speed"))])? .into_unoptimized_plan(), ), ))? - .select(vec![col("t1.c1"), col("t1.c2")])? + .select(vec![col("t1.car"), col("t1.speed")])? .limit(0, Some(3))? .show() .await?; @@ -304,31 +318,27 @@ async fn where_in_subquery(ctx: &SessionContext) -> Result<()> { } /// Use the DataFrame API to execute the following subquery: -/// select t1.c1, t1.c2 from t1 where exists (select t2.c2 from t2 where t1.c1 = t2.c1) limit 3; +/// select t1.car, t1.speed from t1 where exists (select t2.speed from t2 where t1.car = t2.car) limit 3; async fn where_exist_subquery(ctx: &SessionContext) -> Result<()> { ctx.table("t1") .await? .filter(exists(Arc::new( ctx.table("t2") .await? 
- .filter(out_ref_col(DataType::Utf8, "t1.c1").eq(col("t2.c1")))? - .select(vec![col("t2.c2")])? + .filter(out_ref_col(DataType::Utf8, "t1.car").eq(col("t2.car")))? + .select(vec![col("t2.speed")])? .into_unoptimized_plan(), )))? - .select(vec![col("t1.c1"), col("t1.c2")])? + .select(vec![col("t1.car"), col("t1.speed")])? .limit(0, Some(3))? .show() .await?; Ok(()) } -async fn register_aggregate_test_data(name: &str, ctx: &SessionContext) -> Result<()> { - let testdata = datafusion::test_util::arrow_test_data(); - ctx.register_csv( - name, - &format!("{testdata}/csv/aggregate_test_100.csv"), - CsvReadOptions::default(), - ) - .await?; +async fn register_cars_test_data(name: &str, ctx: &SessionContext) -> Result<()> { + let dataset = ExampleDataset::Cars; + ctx.register_csv(name, dataset.path_str()?, CsvReadOptions::default()) + .await?; Ok(()) } diff --git a/datafusion-examples/examples/dataframe/deserialize_to_struct.rs b/datafusion-examples/examples/dataframe/deserialize_to_struct.rs new file mode 100644 index 0000000000000..b031225dc9b69 --- /dev/null +++ b/datafusion-examples/examples/dataframe/deserialize_to_struct.rs @@ -0,0 +1,366 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! See `main.rs` for how to run it. + +use arrow::array::{Array, Float64Array, StringViewArray}; +use datafusion::common::assert_batches_eq; +use datafusion::error::Result; +use datafusion::prelude::*; +use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet}; +use futures::StreamExt; + +/// This example shows how to convert query results into Rust structs by using +/// the Arrow APIs to convert the results into Rust native types. 
+///
+/// This is a bit tricky initially as the results are returned as columns stored
+/// as [ArrayRef]
+///
+/// [ArrayRef]: arrow::array::ArrayRef
+pub async fn deserialize_to_struct() -> Result<()> {
+    // Run a query that returns two columns of data
+    let ctx = SessionContext::new();
+
+    // Convert the CSV input into a temporary Parquet directory for querying
+    let dataset = ExampleDataset::Cars;
+    let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?;
+
+    ctx.register_parquet(
+        "cars",
+        parquet_temp.path_str()?,
+        ParquetReadOptions::default(),
+    )
+    .await?;
+
+    let df = ctx
+        .sql("SELECT car, speed FROM cars ORDER BY speed LIMIT 50")
+        .await?;
+
+    // print out the results, showing we have car and speed columns and a deterministic ordering
+    let results = df.clone().collect().await?;
+    assert_batches_eq!(
+        [
+            "+-------+-------+",
+            "| car   | speed |",
+            "+-------+-------+",
+            "| red   | 0.0   |",
+            "| red   | 1.0   |",
+            "| green | 2.0   |",
+            "| red   | 3.0   |",
+            "| red   | 7.0   |",
+            "| red   | 7.1   |",
+            "| red   | 7.2   |",
+            "| green | 8.0   |",
+            "| green | 10.0  |",
+            "| green | 10.3  |",
+            "| green | 10.4  |",
+            "| green | 10.5  |",
+            "| green | 11.0  |",
+            "| green | 12.0  |",
+            "| green | 14.0  |",
+            "| green | 15.0  |",
+            "| green | 15.1  |",
+            "| green | 15.2  |",
+            "| red   | 17.0  |",
+            "| red   | 18.0  |",
+            "| red   | 19.0  |",
+            "| red   | 20.0  |",
+            "| red   | 20.3  |",
+            "| red   | 21.4  |",
+            "| red   | 21.5  |",
+            "+-------+-------+",
+        ],
+        &results
+    );
+
+    // We will now convert the query results into a Rust struct
+    let mut stream = df.execute_stream().await?;
+    let mut list: Vec<Data> = vec![];
+
+    // DataFusion produces data in chunks called `RecordBatch`es, which are
+    // typically 8000 rows each. This loop processes each `RecordBatch` as it is
+    // produced by the query plan and adds it to the list
+    while let Some(batch) = stream.next().await.transpose()? {
+        // Each `RecordBatch` has one or more columns. Each column is stored as
+        // an `ArrayRef`. To interact with data using Rust native types we need to
+        // convert these `ArrayRef`s into concrete array types using APIs from
+        // the arrow crate.
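        // (Aside: the downcasts just below target `StringViewArray` and
        // `Float64Array`. arrow-rs also offers `AsArray` convenience accessors,
        // so an equivalent formulation would be roughly:
        //     use arrow::array::AsArray;
        //     use arrow::datatypes::Float64Type;
        //     let car_col = batch.column(0).as_string_view();
        //     let speed_col = batch.column(1).as_primitive::<Float64Type>();
        // with the same panic-on-mismatch behavior.)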
+ + // In this case, we know that each batch has two columns of the Arrow + // types StringView and Float64, so first we cast the two columns to the + // appropriate Arrow PrimitiveArray (this is a fast / zero-copy cast).: + let car_col = batch + .column(0) + .as_any() + .downcast_ref::() + .expect("car column must be Utf8View"); + + let speed_col = batch + .column(1) + .as_any() + .downcast_ref::() + .expect("speed column must be Float64"); + + // With PrimitiveArrays, we can access to the values as native Rust + // types String and f64, and forming the desired `Data` structs + for i in 0..batch.num_rows() { + let car = if car_col.is_null(i) { + None + } else { + Some(car_col.value(i).to_string()) + }; + + let speed = if speed_col.is_null(i) { + None + } else { + Some(speed_col.value(i)) + }; + + list.push(Data { car, speed }); + } + } + + // Finally, we have the results in the list of Rust structs + let res = format!("{list:#?}"); + assert_eq!( + res, + r#"[ + Data { + car: Some( + "red", + ), + speed: Some( + 0.0, + ), + }, + Data { + car: Some( + "red", + ), + speed: Some( + 1.0, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 2.0, + ), + }, + Data { + car: Some( + "red", + ), + speed: Some( + 3.0, + ), + }, + Data { + car: Some( + "red", + ), + speed: Some( + 7.0, + ), + }, + Data { + car: Some( + "red", + ), + speed: Some( + 7.1, + ), + }, + Data { + car: Some( + "red", + ), + speed: Some( + 7.2, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 8.0, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 10.0, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 10.3, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 10.4, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 10.5, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 11.0, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 12.0, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 14.0, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 15.0, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 15.1, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 15.2, + ), + }, + Data { + car: Some( + "red", + ), + speed: Some( + 17.0, + ), + }, + Data { + car: Some( + "red", + ), + speed: Some( + 18.0, + ), + }, + Data { + car: Some( + "red", + ), + speed: Some( + 19.0, + ), + }, + Data { + car: Some( + "red", + ), + speed: Some( + 20.0, + ), + }, + Data { + car: Some( + "red", + ), + speed: Some( + 20.3, + ), + }, + Data { + car: Some( + "red", + ), + speed: Some( + 21.4, + ), + }, + Data { + car: Some( + "red", + ), + speed: Some( + 21.5, + ), + }, +]"# + ); + + let speed_green_sum: f64 = list + .iter() + .filter(|data| data.car.as_deref() == Some("green")) + .filter_map(|data| data.speed) + .sum(); + let speed_red_sum: f64 = list + .iter() + .filter(|data| data.car.as_deref() == Some("red")) + .filter_map(|data| data.speed) + .sum(); + assert_eq!(speed_green_sum, 133.5); + assert_eq!(speed_red_sum, 162.5); + + Ok(()) +} + +/// This is target struct where we want the query results. 
+#[derive(Debug)]
+struct Data {
+    car: Option<String>,
+    speed: Option<f64>,
+}
diff --git a/datafusion-examples/examples/dataframe/main.rs b/datafusion-examples/examples/dataframe/main.rs
new file mode 100644
index 0000000000000..25b5377d38239
--- /dev/null
+++ b/datafusion-examples/examples/dataframe/main.rs
@@ -0,0 +1,100 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! # Core DataFrame API usage examples
+//!
+//! These examples demonstrate core DataFrame API usage.
+//!
+//! ## Usage
+//! ```bash
+//! cargo run --example dataframe -- [all|dataframe|deserialize_to_struct|cache_factory]
+//! ```
+//!
+//! Each subcommand runs a corresponding example:
+//! - `all` — run all examples included in this module
+//!
+//! - `cache_factory`
+//!   (file: cache_factory.rs, desc: Custom lazy caching for DataFrames using `CacheFactory`)
+//!
+//! - `dataframe`
+//!   (file: dataframe.rs, desc: Query DataFrames from various sources and write output)
+//!
+//! - `deserialize_to_struct`
+//!   (file: deserialize_to_struct.rs, desc: Convert Arrow arrays into Rust structs)
+
+mod cache_factory;
+mod dataframe;
+mod deserialize_to_struct;
+
+use datafusion::error::{DataFusionError, Result};
+use strum::{IntoEnumIterator, VariantNames};
+use strum_macros::{Display, EnumIter, EnumString, VariantNames};
+
+#[derive(EnumIter, EnumString, Display, VariantNames)]
+#[strum(serialize_all = "snake_case")]
+enum ExampleKind {
+    All,
+    Dataframe,
+    DeserializeToStruct,
+    CacheFactory,
+}
+
+impl ExampleKind {
+    const EXAMPLE_NAME: &str = "dataframe";
+
+    fn runnable() -> impl Iterator<Item = Self> {
+        ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All))
+    }
+
+    async fn run(&self) -> Result<()> {
+        match self {
+            ExampleKind::All => {
+                for example in ExampleKind::runnable() {
+                    println!("Running example: {example}");
+                    Box::pin(example.run()).await?;
+                }
+            }
+            ExampleKind::Dataframe => {
+                dataframe::dataframe_example().await?;
+            }
+            ExampleKind::DeserializeToStruct => {
+                deserialize_to_struct::deserialize_to_struct().await?;
+            }
+            ExampleKind::CacheFactory => {
+                cache_factory::cache_dataframe_with_custom_logic().await?;
+            }
+        }
+        Ok(())
+    }
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let usage = format!(
+        "Usage: cargo run --example {} -- [{}]",
+        ExampleKind::EXAMPLE_NAME,
+        ExampleKind::VARIANTS.join("|")
+    );
+
+    let example: ExampleKind = std::env::args()
+        .nth(1)
+        .unwrap_or_else(|| ExampleKind::All.to_string())
+        .parse()
+        .map_err(|_| DataFusionError::Execution(format!("Unknown example.
{usage}")))?; + + example.run().await +} diff --git a/datafusion-examples/examples/deserialize_to_struct.rs b/datafusion-examples/examples/deserialize_to_struct.rs deleted file mode 100644 index d6655b3b654f9..0000000000000 --- a/datafusion-examples/examples/deserialize_to_struct.rs +++ /dev/null @@ -1,150 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use arrow::array::{AsArray, PrimitiveArray}; -use arrow::datatypes::{Float64Type, Int32Type}; -use datafusion::common::assert_batches_eq; -use datafusion::error::Result; -use datafusion::prelude::*; -use futures::StreamExt; - -/// This example shows how to convert query results into Rust structs by using -/// the Arrow APIs to convert the results into Rust native types. -/// -/// This is a bit tricky initially as the results are returned as columns stored -/// as [ArrayRef] -/// -/// [ArrayRef]: arrow::array::ArrayRef -#[tokio::main] -async fn main() -> Result<()> { - // Run a query that returns two columns of data - let ctx = SessionContext::new(); - let testdata = datafusion::test_util::parquet_test_data(); - ctx.register_parquet( - "alltypes_plain", - &format!("{testdata}/alltypes_plain.parquet"), - ParquetReadOptions::default(), - ) - .await?; - let df = ctx - .sql("SELECT int_col, double_col FROM alltypes_plain") - .await?; - - // print out the results showing we have an int32 and a float64 column - let results = df.clone().collect().await?; - assert_batches_eq!( - [ - "+---------+------------+", - "| int_col | double_col |", - "+---------+------------+", - "| 0 | 0.0 |", - "| 1 | 10.1 |", - "| 0 | 0.0 |", - "| 1 | 10.1 |", - "| 0 | 0.0 |", - "| 1 | 10.1 |", - "| 0 | 0.0 |", - "| 1 | 10.1 |", - "+---------+------------+", - ], - &results - ); - - // We will now convert the query results into a Rust struct - let mut stream = df.execute_stream().await?; - let mut list = vec![]; - - // DataFusion produces data in chunks called `RecordBatch`es which are - // typically 8000 rows each. This loop processes each `RecordBatch` as it is - // produced by the query plan and adds it to the list - while let Some(b) = stream.next().await.transpose()? { - // Each `RecordBatch` has one or more columns. Each column is stored as - // an `ArrayRef`. To interact with data using Rust native types we need to - // convert these `ArrayRef`s into concrete array types using APIs from - // the arrow crate. 
- - // In this case, we know that each batch has two columns of the Arrow - // types Int32 and Float64, so first we cast the two columns to the - // appropriate Arrow PrimitiveArray (this is a fast / zero-copy cast).: - let int_col: &PrimitiveArray = b.column(0).as_primitive(); - let float_col: &PrimitiveArray = b.column(1).as_primitive(); - - // With PrimitiveArrays, we can access to the values as native Rust - // types i32 and f64, and forming the desired `Data` structs - for (i, f) in int_col.values().iter().zip(float_col.values()) { - list.push(Data { - int_col: *i, - double_col: *f, - }) - } - } - - // Finally, we have the results in the list of Rust structs - let res = format!("{list:#?}"); - assert_eq!( - res, - r#"[ - Data { - int_col: 0, - double_col: 0.0, - }, - Data { - int_col: 1, - double_col: 10.1, - }, - Data { - int_col: 0, - double_col: 0.0, - }, - Data { - int_col: 1, - double_col: 10.1, - }, - Data { - int_col: 0, - double_col: 0.0, - }, - Data { - int_col: 1, - double_col: 10.1, - }, - Data { - int_col: 0, - double_col: 0.0, - }, - Data { - int_col: 1, - double_col: 10.1, - }, -]"# - ); - - // Use the fields in the struct to avoid clippy complaints - let int_sum = list.iter().fold(0, |acc, x| acc + x.int_col); - let double_sum = list.iter().fold(0.0, |acc, x| acc + x.double_col); - assert_eq!(int_sum, 4); - assert_eq!(double_sum, 40.4); - - Ok(()) -} - -/// This is target struct where we want the query results. -#[derive(Debug)] -struct Data { - int_col: i32, - double_col: f64, -} diff --git a/datafusion-examples/examples/execution_monitoring/main.rs b/datafusion-examples/examples/execution_monitoring/main.rs new file mode 100644 index 0000000000000..8f80c36929ca2 --- /dev/null +++ b/datafusion-examples/examples/execution_monitoring/main.rs @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! # These examples of memory and performance management +//! +//! These examples demonstrate memory and performance management. +//! +//! ## Usage +//! ```bash +//! cargo run --example execution_monitoring -- [all|mem_pool_exec_plan|mem_pool_tracking|tracing] +//! ``` +//! +//! Each subcommand runs a corresponding example: +//! - `all` — run all examples included in this module +//! +//! - `mem_pool_exec_plan` +//! (file: memory_pool_execution_plan.rs, desc: Memory-aware ExecutionPlan with spilling) +//! +//! - `mem_pool_tracking` +//! (file: memory_pool_tracking.rs, desc: Demonstrates memory tracking) +//! +//! - `tracing` +//! 
(file: tracing.rs, desc: Demonstrates tracing integration)
+
+mod memory_pool_execution_plan;
+mod memory_pool_tracking;
+mod tracing;
+
+use datafusion::error::{DataFusionError, Result};
+use strum::{IntoEnumIterator, VariantNames};
+use strum_macros::{Display, EnumIter, EnumString, VariantNames};
+
+#[derive(EnumIter, EnumString, Display, VariantNames)]
+#[strum(serialize_all = "snake_case")]
+enum ExampleKind {
+    All,
+    MemPoolExecPlan,
+    MemPoolTracking,
+    Tracing,
+}
+
+impl ExampleKind {
+    const EXAMPLE_NAME: &str = "execution_monitoring";
+
+    fn runnable() -> impl Iterator<Item = Self> {
+        ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All))
+    }
+
+    async fn run(&self) -> Result<()> {
+        match self {
+            ExampleKind::All => {
+                for example in ExampleKind::runnable() {
+                    println!("Running example: {example}");
+                    Box::pin(example.run()).await?;
+                }
+            }
+            ExampleKind::MemPoolExecPlan => {
+                memory_pool_execution_plan::memory_pool_execution_plan().await?
+            }
+            ExampleKind::MemPoolTracking => {
+                memory_pool_tracking::mem_pool_tracking().await?
+            }
+            ExampleKind::Tracing => tracing::tracing().await?,
+        }
+        Ok(())
+    }
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let usage = format!(
+        "Usage: cargo run --example {} -- [{}]",
+        ExampleKind::EXAMPLE_NAME,
+        ExampleKind::VARIANTS.join("|")
+    );
+
+    let example: ExampleKind = std::env::args()
+        .nth(1)
+        .unwrap_or_else(|| ExampleKind::All.to_string())
+        .parse()
+        .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?;
+
+    example.run().await
+}
diff --git a/datafusion-examples/examples/memory_pool_execution_plan.rs b/datafusion-examples/examples/execution_monitoring/memory_pool_execution_plan.rs
similarity index 90%
rename from datafusion-examples/examples/memory_pool_execution_plan.rs
rename to datafusion-examples/examples/execution_monitoring/memory_pool_execution_plan.rs
index 3258cde17625f..dc374c7e02fe5 100644
--- a/datafusion-examples/examples/memory_pool_execution_plan.rs
+++ b/datafusion-examples/examples/execution_monitoring/memory_pool_execution_plan.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+//!
 //! This example demonstrates how to implement custom ExecutionPlans that properly
 //! use memory tracking through TrackConsumersPool.
 //!
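As a rough sketch of the reservation pattern this file demonstrates (the pool
size, consumer name, and byte counts are arbitrary; the types are the DataFusion
memory-pool APIs referenced by the imports in the hunks below):

    use std::num::NonZeroUsize;
    use std::sync::Arc;
    use datafusion::error::Result;
    use datafusion::execution::memory_pool::{
        GreedyMemoryPool, MemoryConsumer, MemoryPool, TrackConsumersPool,
    };

    fn reservation_sketch() -> Result<()> {
        // Wrap an inner pool so "resources exhausted" errors report the top
        // consumers by name.
        let pool: Arc<dyn MemoryPool> = Arc::new(TrackConsumersPool::new(
            GreedyMemoryPool::new(64 * 1024 * 1024), // 64 MB budget
            NonZeroUsize::new(5).unwrap(),           // report top 5 consumers
        ));

        // An operator registers a named consumer, then grows/shrinks its
        // reservation as it buffers data; try_grow fails once the budget is hit.
        let mut reservation =
            MemoryConsumer::new("BufferingExecutionPlan").register(&pool);
        reservation.try_grow(8 * 1024)?; // account for ~8 KB of buffered batches
        reservation.shrink(8 * 1024); // release when spilled or emitted
        Ok(())
    }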
@@ -27,8 +29,9 @@ use arrow::record_batch::RecordBatch; use arrow_schema::SchemaRef; use datafusion::common::record_batch; +use datafusion::common::tree_node::TreeNodeRecursion; use datafusion::common::{exec_datafusion_err, internal_err}; -use datafusion::datasource::{memory::MemTable, DefaultTableSource}; +use datafusion::datasource::{DefaultTableSource, memory::MemTable}; use datafusion::error::Result; use datafusion::execution::memory_pool::{MemoryConsumer, MemoryReservation}; use datafusion::execution::runtime_env::RuntimeEnvBuilder; @@ -36,16 +39,15 @@ use datafusion::execution::{SendableRecordBatchStream, TaskContext}; use datafusion::logical_expr::LogicalPlanBuilder; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::{ - DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, Statistics, + DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, }; use datafusion::prelude::*; use futures::stream::{StreamExt, TryStreamExt}; -use std::any::Any; use std::fmt; use std::sync::Arc; -#[tokio::main] -async fn main() -> Result<(), Box> { +/// Shows how to implement memory-aware ExecutionPlan with memory reservation and spilling +pub async fn memory_pool_execution_plan() -> Result<()> { println!("=== DataFusion ExecutionPlan Memory Tracking Example ===\n"); // Set up a runtime with memory tracking @@ -140,6 +142,7 @@ impl ExternalBatchBufferer { } } + #[expect(clippy::needless_pass_by_value)] fn add_batch(&mut self, batch_data: Vec) -> Result<()> { let additional_memory = batch_data.len(); @@ -196,7 +199,7 @@ impl ExternalBatchBufferer { struct BufferingExecutionPlan { schema: SchemaRef, input: Arc, - properties: PlanProperties, + properties: Arc, } impl BufferingExecutionPlan { @@ -222,15 +225,11 @@ impl ExecutionPlan for BufferingExecutionPlan { "BufferingExecutionPlan" } - fn as_any(&self) -> &dyn Any { - self - } - fn schema(&self) -> SchemaRef { self.schema.clone() } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.properties } @@ -294,7 +293,19 @@ impl ExecutionPlan for BufferingExecutionPlan { ))) } - fn statistics(&self) -> Result { - Ok(Statistics::new_unknown(&self.schema)) + fn apply_expressions( + &self, + f: &mut dyn FnMut( + &dyn datafusion::physical_plan::PhysicalExpr, + ) -> Result, + ) -> Result { + // Visit expressions in the output ordering from equivalence properties + let mut tnr = TreeNodeRecursion::Continue; + if let Some(ordering) = self.properties.output_ordering() { + for sort_expr in ordering { + tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?; + } + } + Ok(tnr) } } diff --git a/datafusion-examples/examples/memory_pool_tracking.rs b/datafusion-examples/examples/execution_monitoring/memory_pool_tracking.rs similarity index 89% rename from datafusion-examples/examples/memory_pool_tracking.rs rename to datafusion-examples/examples/execution_monitoring/memory_pool_tracking.rs index d5823b1173ab3..d849a033bc66b 100644 --- a/datafusion-examples/examples/memory_pool_tracking.rs +++ b/datafusion-examples/examples/execution_monitoring/memory_pool_tracking.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. +//! //! This example demonstrates how to use TrackConsumersPool for memory tracking and debugging. //! //! The TrackConsumersPool provides enhanced error messages that show the top memory consumers @@ -24,11 +26,12 @@ //! //! 
* [`automatic_usage_example`]: Shows how to use RuntimeEnvBuilder to automatically enable memory tracking +use datafusion::error::Result; use datafusion::execution::runtime_env::RuntimeEnvBuilder; use datafusion::prelude::*; -#[tokio::main] -async fn main() -> Result<(), Box> { +/// Demonstrates TrackConsumersPool for memory tracking and debugging with enhanced error messages +pub async fn mem_pool_tracking() -> Result<()> { println!("=== DataFusion Memory Pool Tracking Example ===\n"); // Example 1: Automatic Usage with RuntimeEnvBuilder @@ -41,7 +44,7 @@ async fn main() -> Result<(), Box> { /// /// This shows the recommended way to use TrackConsumersPool through RuntimeEnvBuilder, /// which automatically creates a TrackConsumersPool with sensible defaults. -async fn automatic_usage_example() -> datafusion::error::Result<()> { +async fn automatic_usage_example() -> Result<()> { println!("Example 1: Automatic Usage with RuntimeEnvBuilder"); println!("------------------------------------------------"); @@ -107,7 +110,8 @@ async fn automatic_usage_example() -> datafusion::error::Result<()> { println!("✓ Expected memory limit error during data processing:"); println!("Error: {e}"); /* Example error message: - Error: Not enough memory to continue external sort. Consider increasing the memory limit, or decreasing sort_spill_reservation_bytes + Error: Not enough memory to continue external sort. Consider increasing the memory limit config: 'datafusion.runtime.memory_limit', + or decreasing the config: 'datafusion.execution.sort_spill_reservation_bytes'. caused by Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as: ExternalSorterMerge[3]#112(can spill: false) consumed 10.0 MB, peak 10.0 MB, @@ -115,7 +119,8 @@ async fn automatic_usage_example() -> datafusion::error::Result<()> { ExternalSorter[1]#93(can spill: true) consumed 69.0 KB, peak 69.0 KB, ExternalSorter[13]#155(can spill: true) consumed 67.6 KB, peak 67.6 KB, ExternalSorter[8]#140(can spill: true) consumed 67.2 KB, peak 67.2 KB. - Error: Failed to allocate additional 10.0 MB for ExternalSorterMerge[0] with 0.0 B already allocated for this reservation - 7.1 MB remain available for the total pool + Error: Failed to allocate additional 10.0 MB for ExternalSorterMerge[0] with 0.0 B already allocated + for this reservation - 7.1 MB remain available for the total memory pool */ } } diff --git a/datafusion-examples/examples/tracing.rs b/datafusion-examples/examples/execution_monitoring/tracing.rs similarity index 82% rename from datafusion-examples/examples/tracing.rs rename to datafusion-examples/examples/execution_monitoring/tracing.rs index 334ee0f4e5686..172c1ca83b3bd 100644 --- a/datafusion-examples/examples/tracing.rs +++ b/datafusion-examples/examples/execution_monitoring/tracing.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. +//! //! This example demonstrates the tracing injection feature for the DataFusion runtime. //! Tasks spawned on new threads behave differently depending on whether a tracer is injected. //! The log output clearly distinguishes the two cases. @@ -49,20 +51,21 @@ //! 10:29:40.809 INFO main ThreadId(01) tracing: ***** WITH tracer: Non-main tasks DID inherit the `run_instrumented_query` span ***** //! 
``` -use datafusion::common::runtime::{set_join_set_tracer, JoinSetTracer}; +use std::any::Any; +use std::sync::Arc; + +use datafusion::common::runtime::{JoinSetTracer, set_join_set_tracer}; use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::ListingOptions; use datafusion::error::Result; use datafusion::prelude::*; -use datafusion::test_util::parquet_test_data; -use futures::future::BoxFuture; +use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet}; use futures::FutureExt; -use std::any::Any; -use std::sync::Arc; -use tracing::{info, instrument, Instrument, Level, Span}; +use futures::future::BoxFuture; +use tracing::{Instrument, Level, Span, info, instrument}; -#[tokio::main] -async fn main() -> Result<()> { +/// Demonstrates the tracing injection feature for the DataFusion runtime +pub async fn tracing() -> Result<()> { // Initialize tracing subscriber with thread info. tracing_subscriber::fmt() .with_thread_ids(true) @@ -73,7 +76,9 @@ async fn main() -> Result<()> { // Run query WITHOUT tracer injection. info!("***** RUNNING WITHOUT INJECTED TRACER *****"); run_instrumented_query().await?; - info!("***** WITHOUT tracer: `tokio-runtime-worker` tasks did NOT inherit the `run_instrumented_query` span *****"); + info!( + "***** WITHOUT tracer: `tokio-runtime-worker` tasks did NOT inherit the `run_instrumented_query` span *****" + ); // Inject custom tracer so tasks run in the current span. info!("Injecting custom tracer..."); @@ -82,7 +87,9 @@ async fn main() -> Result<()> { // Run query WITH tracer injection. info!("***** RUNNING WITH INJECTED TRACER *****"); run_instrumented_query().await?; - info!("***** WITH tracer: `tokio-runtime-worker` tasks DID inherit the `run_instrumented_query` span *****"); + info!( + "***** WITH tracer: `tokio-runtime-worker` tasks DID inherit the `run_instrumented_query` span *****" + ); Ok(()) } @@ -120,18 +127,27 @@ async fn run_instrumented_query() -> Result<()> { info!("Starting query execution"); let ctx = SessionContext::new(); - let test_data = parquet_test_data(); + + // Convert the CSV input into a temporary Parquet directory for querying + let dataset = ExampleDataset::Cars; + let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?; + let file_format = ParquetFormat::default().with_enable_pruning(true); - let listing_options = ListingOptions::new(Arc::new(file_format)) - .with_file_extension("alltypes_tiny_pages_plain.parquet"); + let listing_options = + ListingOptions::new(Arc::new(file_format)).with_file_extension(".parquet"); - let table_path = format!("file://{test_data}/"); - info!("Registering table 'alltypes' from {}", table_path); - ctx.register_listing_table("alltypes", &table_path, listing_options, None, None) - .await - .expect("Failed to register table"); + info!("Registering table 'cars' from {}", parquet_temp.path_str()?); + ctx.register_listing_table( + "cars", + parquet_temp.path_str()?, + listing_options, + None, + None, + ) + .await + .expect("Failed to register table"); - let sql = "SELECT COUNT(*), string_col FROM alltypes GROUP BY string_col"; + let sql = "SELECT COUNT(*), car, sum(speed) FROM cars GROUP BY car"; info!(sql, "Executing SQL query"); let result = ctx.sql(sql).await?.collect().await?; info!("Query complete: {} batches returned", result.len()); diff --git a/datafusion-examples/examples/extension_types/main.rs b/datafusion-examples/examples/extension_types/main.rs new file mode 100644 index 0000000000000..97c00fdcb64f8 --- 
/dev/null +++ b/datafusion-examples/examples/extension_types/main.rs @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! # Extension type usage examples +//! +//! These examples demonstrate the API for creating and using custom extension types. +//! +//! ## Usage +//! ```bash +//! cargo run --example extension_types -- [all|temperature] +//! ``` +//! +//! Each subcommand runs a corresponding example: +//! - `all` — run all examples included in this module +//! +//! - `temperature` +//! (file: temperature.rs, desc: Extension type for temperature data.) + +mod temperature; + +use datafusion::error::{DataFusionError, Result}; +use strum::{IntoEnumIterator, VariantNames}; +use strum_macros::{Display, EnumIter, EnumString, VariantNames}; + +#[derive(EnumIter, EnumString, Display, VariantNames)] +#[strum(serialize_all = "snake_case")] +enum ExampleKind { + All, + Temperature, +} + +impl ExampleKind { + const EXAMPLE_NAME: &str = "extension_types"; + + fn runnable() -> impl Iterator { + ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All)) + } + + async fn run(&self) -> Result<()> { + match self { + ExampleKind::All => { + for example in ExampleKind::runnable() { + println!("Running example: {example}"); + Box::pin(example.run()).await?; + } + } + ExampleKind::Temperature => { + temperature::temperature_example().await?; + } + } + Ok(()) + } +} + +#[tokio::main] +async fn main() -> Result<()> { + let usage = format!( + "Usage: cargo run --example {} -- [{}]", + ExampleKind::EXAMPLE_NAME, + ExampleKind::VARIANTS.join("|") + ); + + let example: ExampleKind = std::env::args() + .nth(1) + .unwrap_or_else(|| ExampleKind::All.to_string()) + .parse() + .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?; + + example.run().await +} diff --git a/datafusion-examples/examples/extension_types/temperature.rs b/datafusion-examples/examples/extension_types/temperature.rs new file mode 100644 index 0000000000000..478cf5ebbf312 --- /dev/null +++ b/datafusion-examples/examples/extension_types/temperature.rs @@ -0,0 +1,316 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ + Array, ArrowPrimitiveType, AsArray, Float32Array, Float64Array, PrimitiveArray, + RecordBatch, StringArray, +}; +use arrow::datatypes::{Float32Type, Float64Type}; +use arrow::util::display::{ArrayFormatter, DisplayIndex, FormatOptions, FormatResult}; +use arrow_schema::extension::{EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY}; +use arrow_schema::{ArrowError, DataType, Field, Schema, SchemaRef}; +use datafusion::dataframe::DataFrame; +use datafusion::error::Result; +use datafusion::execution::SessionStateBuilder; +use datafusion::prelude::SessionContext; +use datafusion_common::internal_err; +use datafusion_common::types::DFExtensionType; +use datafusion_expr::registry::{ + ExtensionTypeRegistration, ExtensionTypeRegistry, MemoryExtensionTypeRegistry, +}; +use std::collections::HashMap; +use std::fmt::{Display, Write}; +use std::sync::Arc; + +/// This example demonstrates using DataFusion's extension type API to create a custom +/// extension type [`TemperatureExtensionType`] for representing different temperature units. +pub async fn temperature_example() -> Result<()> { + let ctx = create_session_context()?; + register_temperature_table(&ctx).await?; + + // Print the example table with the custom pretty-printer. + ctx.table("example").await?.show().await +} + +/// Creates the DataFusion session context with the custom extension type implementation. +fn create_session_context() -> Result { + let registry = MemoryExtensionTypeRegistry::new_empty(); + + // The registration creates a new instance of the extension type with the deserialized metadata. + let temp_registration = ExtensionTypeRegistration::new_arc( + TemperatureExtensionType::NAME, + |storage_type, metadata| { + Ok(Arc::new(TemperatureExtensionType::try_new( + storage_type, + TemperatureUnit::deserialize(metadata)?, + )?)) + }, + ); + registry.add_extension_type_registration(temp_registration)?; + + let state = SessionStateBuilder::default() + .with_extension_type_registry(Arc::new(registry)) + .build(); + Ok(SessionContext::new_with_state(state)) +} + +/// Registers the example table and returns the data frame. +async fn register_temperature_table(ctx: &SessionContext) -> Result { + let schema = example_schema(); + + let city_names = Arc::new(StringArray::from(vec![ + "Vienna", "Tokyo", "New York", "Sydney", + ])); + + // The temperature readings in different units + let celsius_temps = vec![15.1, 22.5, 18.98, 25.0]; + let fahrenheit_temps = vec![59.18, 72.5, 66.164, 77.0]; + let kelvin_temps = vec![288.25, 295.65, 292.13, 298.15]; + + let batch = RecordBatch::try_new( + schema, + vec![ + city_names, + Arc::new(Float64Array::from(celsius_temps)), + Arc::new(Float64Array::from(fahrenheit_temps)), + Arc::new(Float32Array::from(kelvin_temps)), // Demonstrate use of different storage type + ], + )?; + + ctx.register_batch("example", batch)?; + ctx.table("example").await +} + +/// The schema of the example table. 
+fn example_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("city", DataType::Utf8, false), + Field::new("celsius", DataType::Float64, false) + .with_metadata(create_metadata(TemperatureUnit::Celsius)), + Field::new("fahrenheit", DataType::Float64, false) + .with_metadata(create_metadata(TemperatureUnit::Fahrenheit)), + Field::new("kelvin", DataType::Float32, false) + .with_metadata(create_metadata(TemperatureUnit::Kelvin)), + ])) +} + +/// Represents a float that semantically represents a temperature. The temperature can be one of +/// the supported [`TemperatureUnit`]s. +/// +/// The unit is realized as an additional extension type metadata and is stored alongside the +/// extension type name in the Arrow field metadata. This metadata can also be stored within files, +/// allowing DataFusion to read temperature data from, for example, Parquet files. +/// +/// The field metadata for a Celsius temperature field will look like this (serialized as JSON): +/// ```json +/// { +/// "ARROW:extension:name": "custom.temperature", +/// "ARROW:extension:metadata": "celsius" +/// } +/// ``` +/// +/// See [the official Arrow documentation](https://arrow.apache.org/docs/format/Columnar.html#extension-types) +/// for more details on the extension type mechanism. +#[derive(Debug)] +pub struct TemperatureExtensionType { + /// Extension type instances are always for a specific storage type and metadata pairing. + /// Therefore, we store the storage type. + storage_type: DataType, + /// The unit of the temperature. + temperature_unit: TemperatureUnit, +} + +impl TemperatureExtensionType { + /// The name of the extension type. + pub const NAME: &'static str = "custom.temperature"; + + /// Creates a new [`TemperatureExtensionType`]. + pub fn try_new( + storage_type: &DataType, + temperature_unit: TemperatureUnit, + ) -> Result { + match storage_type { + DataType::Float32 | DataType::Float64 => {} + _ => { + return Err(ArrowError::InvalidArgumentError(format!( + "Invalid data type: {storage_type} for temperature type, expected Float32 or Float64", + ))); + } + } + + let result = Self { + storage_type: storage_type.clone(), + temperature_unit, + }; + Ok(result) + } +} + +/// Represents the unit of a temperature reading. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TemperatureUnit { + Celsius, + Fahrenheit, + Kelvin, +} + +impl TemperatureUnit { + /// Arrow extension type metadata is encoded as a string and stored using the + /// `ARROW:extension:metadata` key. As we only store the name of the unit, a simple string + /// suffices. Extension types can store more complex metadata using serialization formats like + /// JSON. + pub fn serialize(self) -> String { + let result = match self { + TemperatureUnit::Celsius => "celsius", + TemperatureUnit::Fahrenheit => "fahrenheit", + TemperatureUnit::Kelvin => "kelvin", + }; + result.to_owned() + } + + /// Inverse operation of [`TemperatureUnit::serialize`]. This creates the [`TemperatureUnit`] + /// value from the serialized string. 
+ pub fn deserialize(value: Option<&str>) -> std::result::Result { + match value { + Some("celsius") => Ok(TemperatureUnit::Celsius), + Some("fahrenheit") => Ok(TemperatureUnit::Fahrenheit), + Some("kelvin") => Ok(TemperatureUnit::Kelvin), + Some(other) => Err(ArrowError::InvalidArgumentError(format!( + "Invalid metadata for temperature type: {other}" + ))), + None => Err(ArrowError::InvalidArgumentError( + "Temperature type requires metadata (unit)".to_owned(), + )), + } + } +} + +/// This creates a metadata map for the temperature type. Another way of writing the metadata can be +/// implemented using arrow-rs' [`ExtensionType`](arrow_schema::extension::ExtensionType) trait. +fn create_metadata(unit: TemperatureUnit) -> HashMap { + HashMap::from([ + ( + EXTENSION_TYPE_NAME_KEY.to_owned(), + TemperatureExtensionType::NAME.to_owned(), + ), + (EXTENSION_TYPE_METADATA_KEY.to_owned(), unit.serialize()), + ]) +} + +/// Implementation of [`DFExtensionType`] for [`TemperatureExtensionType`]. +/// +/// This implements the trait for customizing DataFusion. +impl DFExtensionType for TemperatureExtensionType { + fn storage_type(&self) -> DataType { + self.storage_type.clone() + } + + fn serialize_metadata(&self) -> Option { + Some(self.temperature_unit.serialize()) + } + + fn create_array_formatter<'fmt>( + &self, + array: &'fmt dyn Array, + options: &FormatOptions<'fmt>, + ) -> Result>> { + match self.storage_type { + DataType::Float32 => { + let display_index = TemperatureDisplayIndex { + array: array.as_primitive::(), + null_str: options.null(), + unit: self.temperature_unit, + }; + Ok(Some(ArrayFormatter::new( + Box::new(display_index), + options.safe(), + ))) + } + DataType::Float64 => { + let display_index = TemperatureDisplayIndex { + array: array.as_primitive::(), + null_str: options.null(), + unit: self.temperature_unit, + }; + Ok(Some(ArrayFormatter::new( + Box::new(display_index), + options.safe(), + ))) + } + _ => internal_err!("Wrong array type for Temperature"), + } + } +} + +/// Pretty printer for temperatures. +#[derive(Debug)] +struct TemperatureDisplayIndex<'a, TNative: ArrowPrimitiveType> { + array: &'a PrimitiveArray, + null_str: &'a str, + unit: TemperatureUnit, +} + +/// Implements the custom display logic. 
+impl> DisplayIndex + for TemperatureDisplayIndex<'_, TNative> +{ + fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { + if self.array.is_null(idx) { + write!(f, "{}", self.null_str)?; + return Ok(()); + } + + let value = self.array.value(idx); + let suffix = match self.unit { + TemperatureUnit::Celsius => "°C", + TemperatureUnit::Fahrenheit => "°F", + TemperatureUnit::Kelvin => "K", + }; + + write!(f, "{value:.2} {suffix}")?; + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use insta::assert_snapshot; + + #[tokio::test] + async fn test_print_example_table() -> Result<()> { + let ctx = create_session_context()?; + let table = register_temperature_table(&ctx).await?; + + assert_snapshot!( + table.to_string().await?, + @r" + +----------+----------+------------+----------+ + | city | celsius | fahrenheit | kelvin | + +----------+----------+------------+----------+ + | Vienna | 15.10 °C | 59.18 °F | 288.25 K | + | Tokyo | 22.50 °C | 72.50 °F | 295.65 K | + | New York | 18.98 °C | 66.16 °F | 292.13 K | + | Sydney | 25.00 °C | 77.00 °F | 298.15 K | + +----------+----------+------------+----------+ + " + ); + + Ok(()) + } +} diff --git a/datafusion-examples/examples/external_dependency/dataframe-to-s3.rs b/datafusion-examples/examples/external_dependency/dataframe_to_s3.rs similarity index 87% rename from datafusion-examples/examples/external_dependency/dataframe-to-s3.rs rename to datafusion-examples/examples/external_dependency/dataframe_to_s3.rs index e75ba5dd5328a..fdb8a3c9c051a 100644 --- a/datafusion-examples/examples/external_dependency/dataframe-to-s3.rs +++ b/datafusion-examples/examples/external_dependency/dataframe_to_s3.rs @@ -15,12 +15,14 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. + use std::env; use std::sync::Arc; use datafusion::dataframe::DataFrameWriteOptions; -use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::file_format::FileFormat; +use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::ListingOptions; use datafusion::error::Result; use datafusion::prelude::*; @@ -28,14 +30,18 @@ use datafusion::prelude::*; use object_store::aws::AmazonS3Builder; use url::Url; -/// This example demonstrates querying data from AmazonS3 and writing -/// the result of a query back to AmazonS3 -#[tokio::main] -async fn main() -> Result<()> { +/// This example demonstrates querying data from Amazon S3 and writing +/// the result of a query back to Amazon S3. 
+///
+/// The following environment variables must be defined:
+///
+/// - AWS_ACCESS_KEY_ID
+/// - AWS_SECRET_ACCESS_KEY
+pub async fn dataframe_to_s3() -> Result<()> {
     // create local execution context
     let ctx = SessionContext::new();
 
-    //enter region and bucket to which your credentials have GET and PUT access
+    // enter region and bucket to which your credentials have GET and PUT access
     let region = "";
     let bucket_name = "";
 
@@ -66,13 +72,13 @@
         .write_parquet(&out_path, DataFrameWriteOptions::new(), None)
         .await?;
 
-    //write as JSON to s3
+    // write as JSON to s3
     let json_out = format!("s3://{bucket_name}/json_out");
     df.clone()
         .write_json(&json_out, DataFrameWriteOptions::new(), None)
         .await?;
 
-    //write as csv to s3
+    // write as csv to s3
     let csv_out = format!("s3://{bucket_name}/csv_out");
     df.write_csv(&csv_out, DataFrameWriteOptions::new(), None)
         .await?;
diff --git a/datafusion-examples/examples/external_dependency/main.rs b/datafusion-examples/examples/external_dependency/main.rs
new file mode 100644
index 0000000000000..447e7d38bdd5b
--- /dev/null
+++ b/datafusion-examples/examples/external_dependency/main.rs
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! # Amazon S3 examples
+//!
+//! These examples demonstrate how to work with data from Amazon S3.
+//!
+//! ## Usage
+//! ```bash
+//! cargo run --example external_dependency -- [all|dataframe_to_s3|query_aws_s3]
+//! ```
+//!
+//! Each subcommand runs a corresponding example:
+//! - `all` — run all examples included in this module
+//!
+//! - `dataframe_to_s3`
+//!   (file: dataframe_to_s3.rs, desc: Query DataFrames and write results to S3)
+//!
+//! - `query_aws_s3`
+//!
(file: query_aws_s3.rs, desc: Query S3-backed data using object_store) + +mod dataframe_to_s3; +mod query_aws_s3; + +use datafusion::error::{DataFusionError, Result}; +use strum::{IntoEnumIterator, VariantNames}; +use strum_macros::{Display, EnumIter, EnumString, VariantNames}; + +#[derive(EnumIter, EnumString, Display, VariantNames)] +#[strum(serialize_all = "snake_case")] +enum ExampleKind { + All, + DataframeToS3, + QueryAwsS3, +} + +impl ExampleKind { + const EXAMPLE_NAME: &str = "external_dependency"; + + fn runnable() -> impl Iterator { + ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All)) + } + + async fn run(&self) -> Result<()> { + match self { + ExampleKind::All => { + for example in ExampleKind::runnable() { + println!("Running example: {example}"); + Box::pin(example.run()).await?; + } + } + ExampleKind::DataframeToS3 => dataframe_to_s3::dataframe_to_s3().await?, + ExampleKind::QueryAwsS3 => query_aws_s3::query_aws_s3().await?, + } + Ok(()) + } +} + +#[tokio::main] +async fn main() -> Result<()> { + let usage = format!( + "Usage: cargo run --example {} -- [{}]", + ExampleKind::EXAMPLE_NAME, + ExampleKind::VARIANTS.join("|") + ); + + let example: ExampleKind = std::env::args() + .nth(1) + .unwrap_or_else(|| ExampleKind::All.to_string()) + .parse() + .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?; + + example.run().await +} diff --git a/datafusion-examples/examples/external_dependency/query-aws-s3.rs b/datafusion-examples/examples/external_dependency/query_aws_s3.rs similarity index 90% rename from datafusion-examples/examples/external_dependency/query-aws-s3.rs rename to datafusion-examples/examples/external_dependency/query_aws_s3.rs index da2d7e4879f99..63507bb3eed11 100644 --- a/datafusion-examples/examples/external_dependency/query-aws-s3.rs +++ b/datafusion-examples/examples/external_dependency/query_aws_s3.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. + use datafusion::error::Result; use datafusion::prelude::*; use object_store::aws::AmazonS3Builder; @@ -22,15 +24,13 @@ use std::env; use std::sync::Arc; use url::Url; -/// This example demonstrates querying data in an S3 bucket. +/// This example demonstrates querying data in a public S3 bucket +/// (the NYC TLC open dataset: `s3://nyc-tlc`). 
/// /// The following environment variables must be defined: -/// -/// - AWS_ACCESS_KEY_ID -/// - AWS_SECRET_ACCESS_KEY -/// -#[tokio::main] -async fn main() -> Result<()> { +/// - `AWS_ACCESS_KEY_ID` +/// - `AWS_SECRET_ACCESS_KEY` +pub async fn query_aws_s3() -> Result<()> { let ctx = SessionContext::new(); // the region must be set to the region where the bucket exists until the following diff --git a/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml b/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml index e9c0c5b43d682..3cfa6dcf90f18 100644 --- a/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml +++ b/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml @@ -22,12 +22,14 @@ edition = { workspace = true } publish = false [dependencies] -abi_stable = "0.11.3" arrow = { workspace = true } datafusion = { workspace = true } datafusion-ffi = { workspace = true } ffi_module_interface = { path = "../ffi_module_interface" } +[lints] +workspace = true + [lib] name = "ffi_example_table_provider" crate-type = ["cdylib", 'rlib'] diff --git a/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs b/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs index a83f15926f054..7894e97f3796d 100644 --- a/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs +++ b/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs @@ -17,12 +17,12 @@ use std::sync::Arc; -use abi_stable::{export_root_module, prefix_type::PrefixTypeTrait}; use arrow::array::RecordBatch; use arrow::datatypes::{DataType, Field, Schema}; use datafusion::{common::record_batch, datasource::MemTable}; +use datafusion_ffi::proto::logical_extension_codec::FFI_LogicalExtensionCodec; use datafusion_ffi::table_provider::FFI_TableProvider; -use ffi_module_interface::{TableProviderModule, TableProviderModuleRef}; +use ffi_module_interface::TableProviderModule; fn create_record_batch(start_value: i32, num_values: usize) -> RecordBatch { let end_value = start_value + num_values as i32; @@ -34,7 +34,9 @@ fn create_record_batch(start_value: i32, num_values: usize) -> RecordBatch { /// Here we only wish to create a simple table provider as an example. /// We create an in-memory table and convert it to it's FFI counterpart. -extern "C" fn construct_simple_table_provider() -> FFI_TableProvider { +extern "C" fn construct_simple_table_provider( + codec: FFI_LogicalExtensionCodec, +) -> FFI_TableProvider { let schema = Arc::new(Schema::new(vec![ Field::new("a", DataType::Int32, true), Field::new("b", DataType::Float64, true), @@ -50,14 +52,13 @@ extern "C" fn construct_simple_table_provider() -> FFI_TableProvider { let table_provider = MemTable::try_new(schema, vec![batches]).unwrap(); - FFI_TableProvider::new(Arc::new(table_provider), true, None) + FFI_TableProvider::new_with_ffi_codec(Arc::new(table_provider), true, None, codec) } -#[export_root_module] +#[unsafe(no_mangle)] /// This defines the entry point for using the module. 
-pub fn get_simple_memory_table() -> TableProviderModuleRef { +pub extern "C" fn ffi_example_get_module() -> TableProviderModule { TableProviderModule { create_table: construct_simple_table_provider, } - .leak_into_prefix() } diff --git a/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml b/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml index 612a219324763..0244cb2a5ed15 100644 --- a/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml +++ b/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml @@ -18,9 +18,11 @@ [package] name = "ffi_module_interface" version = "0.1.0" -edition = "2021" +edition = "2024" publish = false +[lints] +workspace = true + [dependencies] -abi_stable = "0.11.3" datafusion-ffi = { workspace = true } diff --git a/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs b/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs index 88690e9297135..54a59c9e5d073 100644 --- a/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs +++ b/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs @@ -15,35 +15,17 @@ // specific language governing permissions and limitations // under the License. -use abi_stable::{ - declare_root_module_statics, - library::{LibraryError, RootModule}, - package_version_strings, - sabi_types::VersionStrings, - StableAbi, -}; +use datafusion_ffi::proto::logical_extension_codec::FFI_LogicalExtensionCodec; use datafusion_ffi::table_provider::FFI_TableProvider; -#[repr(C)] -#[derive(StableAbi)] -#[sabi(kind(Prefix(prefix_ref = TableProviderModuleRef)))] /// This struct defines the module interfaces. It is to be shared by /// both the module loading program and library that implements the /// module. It is possible to move this definition into the loading /// program and reference it in the modules, but this example shows /// how a user may wish to separate these concerns. +#[repr(C)] pub struct TableProviderModule { /// Constructs the table provider - pub create_table: extern "C" fn() -> FFI_TableProvider, -} - -impl RootModule for TableProviderModuleRef { - declare_root_module_statics! 
{TableProviderModuleRef} - const BASE_NAME: &'static str = "ffi_example_table_provider"; - const NAME: &'static str = "ffi_example_table_provider"; - const VERSION_STRINGS: VersionStrings = package_version_strings!(); - - fn initialization(self) -> Result<Self, LibraryError> { - Ok(self) - } + pub create_table: + extern "C" fn(codec: FFI_LogicalExtensionCodec) -> FFI_TableProvider, } diff --git a/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml b/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml index 028a366aab1c0..e7b2dd19009b5 100644 --- a/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml +++ b/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml @@ -18,12 +18,15 @@ [package] name = "ffi_module_loader" version = "0.1.0" -edition = "2021" +edition = "2024" publish = false +[lints] +workspace = true + [dependencies] -abi_stable = "0.11.3" datafusion = { workspace = true } datafusion-ffi = { workspace = true } ffi_module_interface = { path = "../ffi_module_interface" } +libloading = "0.9" tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } diff --git a/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs b/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs index 6e376ca866e8f..0657c4a08fa86 100644 --- a/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs +++ b/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs @@ -18,44 +18,69 @@ use std::sync::Arc; use datafusion::{ + datasource::TableProvider, error::{DataFusionError, Result}, + execution::TaskContextProvider, prelude::SessionContext, }; - -use abi_stable::library::{development_utils::compute_library_path, RootModule}; -use datafusion_ffi::table_provider::ForeignTableProvider; -use ffi_module_interface::TableProviderModuleRef; +use datafusion_ffi::proto::logical_extension_codec::FFI_LogicalExtensionCodec; +use ffi_module_interface::TableProviderModule; #[tokio::main] async fn main() -> Result<()> { // Find the location of the library. This is specific to the build environment, // so you will need to change the approach here based on your use case. - let target: &std::path::Path = "../../../../target/".as_ref(); - let library_path = compute_library_path::<TableProviderModuleRef>(target) - .map_err(|e| DataFusionError::External(Box::new(e)))?; + let lib_prefix = if cfg!(target_os = "windows") { "" } else { "lib" }; + let lib_ext = if cfg!(target_os = "macos") { "dylib" } else if cfg!(target_os = "windows") { "dll" } else { "so" }; + + let build_type = if cfg!(debug_assertions) { "debug" } else { "release" }; + + let library_path = format!( "../../../../target/{build_type}/{lib_prefix}ffi_example_table_provider.{lib_ext}" ); + + // Load the library using libloading + let lib = unsafe { libloading::Library::new(&library_path) .map_err(|e| DataFusionError::External(Box::new(e)))? }; - // Load the module - let table_provider_module = - TableProviderModuleRef::load_from_directory(&library_path) - .map_err(|e| DataFusionError::External(Box::new(e)))?; + let get_module: libloading::Symbol<extern "C" fn() -> TableProviderModule> = unsafe { lib.get(b"ffi_example_get_module") .map_err(|e| DataFusionError::External(Box::new(e)))? }; + + let table_provider_module = get_module(); + + let ctx = Arc::new(SessionContext::new()); + let codec = FFI_LogicalExtensionCodec::new_default( &(Arc::clone(&ctx) as Arc<dyn TaskContextProvider>), ); // By calling the code below, the table provided will be created within // the module's code.
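As an aside, the `libloading` pattern the loader now relies on, reduced to a standalone sketch; the library path and `add` symbol are hypothetical, not part of this PR:

```rust
// Hypothetical: assumes some cdylib at ./libexample.so exporting
// `#[unsafe(no_mangle)] pub extern "C" fn add(a: i32, b: i32) -> i32`
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Loading a library and resolving symbols is inherently unsafe:
    // the caller must guarantee the symbol's real signature matches
    let lib = unsafe { libloading::Library::new("./libexample.so")? };
    let add: libloading::Symbol<unsafe extern "C" fn(i32, i32) -> i32> =
        unsafe { lib.get(b"add")? };
    println!("2 + 3 = {}", unsafe { add(2, 3) });
    Ok(())
}
```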
- let ffi_table_provider = - table_provider_module - .create_table() - .ok_or(DataFusionError::NotImplemented( - "External table provider failed to implement create_table".to_string(), - ))?(); + let ffi_table_provider = (table_provider_module.create_table)(codec); // In order to access the table provider within this executable, we need to - // turn it into a `ForeignTableProvider`. - let foreign_table_provider: ForeignTableProvider = (&ffi_table_provider).into(); - - let ctx = SessionContext::new(); + // turn it into a `TableProvider`. + let foreign_table_provider: Arc<dyn TableProvider> = (&ffi_table_provider).into(); // Display the data to show the full cycle works. - ctx.register_table("external_table", Arc::new(foreign_table_provider))?; + ctx.register_table("external_table", foreign_table_provider)?; let df = ctx.table("external_table").await?; df.show().await?; diff --git a/datafusion-examples/examples/flight/flight_client.rs b/datafusion-examples/examples/flight/client.rs similarity index 83% rename from datafusion-examples/examples/flight/flight_client.rs rename to datafusion-examples/examples/flight/client.rs index ff4b5903ad884..8f6856a4e4849 100644 --- a/datafusion-examples/examples/flight/flight_client.rs +++ b/datafusion-examples/examples/flight/client.rs @@ -15,24 +15,30 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. + use std::collections::HashMap; use std::sync::Arc; -use tonic::transport::Endpoint; - -use datafusion::arrow::datatypes::Schema; use arrow_flight::flight_descriptor; use arrow_flight::flight_service_client::FlightServiceClient; use arrow_flight::utils::flight_data_to_arrow_batch; use arrow_flight::{FlightDescriptor, Ticket}; +use datafusion::arrow::datatypes::Schema; use datafusion::arrow::util::pretty; +use datafusion::prelude::SessionContext; +use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet}; +use tonic::transport::Endpoint; /// This example shows how to wrap DataFusion with `FlightService` to support looking up schema information for /// Parquet files and executing SQL queries against them on a remote server. /// This example is run along-side the example `flight_server`.
-#[tokio::main] -async fn main() -> Result<(), Box<dyn std::error::Error>> { - let testdata = datafusion::test_util::parquet_test_data(); +pub async fn client() -> Result<(), Box<dyn std::error::Error>> { + let ctx = SessionContext::new(); + + // Convert the CSV input into a temporary Parquet directory for querying + let dataset = ExampleDataset::Cars; + let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?; // Create Flight client let endpoint = Endpoint::new("http://localhost:50051")?; @@ -43,7 +49,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> { let request = tonic::Request::new(FlightDescriptor { r#type: flight_descriptor::DescriptorType::Path as i32, cmd: Default::default(), - path: vec![format!("{testdata}/alltypes_plain.parquet")], + path: vec![format!("{}", parquet_temp.path_str()?)], }); let schema_result = client.get_schema(request).await?.into_inner(); @@ -52,7 +58,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> { // Call do_get to execute a SQL query and receive results let request = tonic::Request::new(Ticket { - ticket: "SELECT id FROM alltypes_plain".into(), + ticket: "SELECT car FROM cars".into(), }); let mut stream = client.do_get(request).await?.into_inner(); diff --git a/datafusion-examples/examples/flight/main.rs b/datafusion-examples/examples/flight/main.rs new file mode 100644 index 0000000000000..426e806486f70 --- /dev/null +++ b/datafusion-examples/examples/flight/main.rs @@ -0,0 +1,101 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! # Arrow Flight Examples +//! +//! These examples demonstrate Arrow Flight usage. +//! +//! ## Usage +//! ```bash +//! cargo run --example flight -- [all|client|server|sql_server] +//! ``` +//! +//! Each subcommand runs a corresponding example: +//! - `all` — run all examples included in this module +//! Note: The Flight server must be started in a separate process +//! before running the `client` example. Therefore, running `all` will +//! not produce a full server+client workflow automatically. +//! +//! - `client` +//! (file: client.rs, desc: Execute SQL queries via Arrow Flight protocol) +//! +//! - `server` +//! (file: server.rs, desc: Run DataFusion server accepting FlightSQL/JDBC queries) +//! +//! - `sql_server` +//! (file: sql_server.rs, desc: Standalone SQL server for JDBC clients) + +mod client; +mod server; +mod sql_server; + +use datafusion::error::{DataFusionError, Result}; +use strum::{IntoEnumIterator, VariantNames}; +use strum_macros::{Display, EnumIter, EnumString, VariantNames}; + +/// The `all` option cannot run all examples end-to-end because the +/// `server` example must run in a separate process before the `client` +/// example can connect. +/// Therefore, `all` only iterates over individually runnable examples.
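The `strum`-based dispatch these new `main.rs` wrappers share, distilled into a standalone sketch; the `HelloWorld` variant is hypothetical:

```rust
use strum::{IntoEnumIterator, VariantNames};
use strum_macros::{Display, EnumIter, EnumString, VariantNames};

#[derive(EnumIter, EnumString, Display, VariantNames)]
#[strum(serialize_all = "snake_case")]
enum Example {
    All,
    HelloWorld, // hypothetical subcommand
}

fn main() {
    // VARIANTS renders the usage string; FromStr maps argv[1] to a variant
    println!("usage: [{}]", Example::VARIANTS.join("|"));
    let selected: Example = "hello_world".parse().expect("known variant");
    println!("selected: {selected}");
    // `all` iterates every variant except itself
    for example in Example::iter().filter(|e| !matches!(e, Example::All)) {
        println!("would run: {example}");
    }
}
```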
+#[derive(EnumIter, EnumString, Display, VariantNames)] +#[strum(serialize_all = "snake_case")] +enum ExampleKind { + All, + Client, + Server, + SqlServer, +} + +impl ExampleKind { + const EXAMPLE_NAME: &str = "flight"; + + fn runnable() -> impl Iterator<Item = Self> { + ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All)) + } + + async fn run(&self) -> Result<(), Box<dyn std::error::Error>> { + match self { + ExampleKind::All => { + for example in ExampleKind::runnable() { + println!("Running example: {example}"); + Box::pin(example.run()).await?; + } + } + ExampleKind::Client => client::client().await?, + ExampleKind::Server => server::server().await?, + ExampleKind::SqlServer => sql_server::sql_server().await?, + } + Ok(()) + } +} + +#[tokio::main] +async fn main() -> Result<(), Box<dyn std::error::Error>> { + let usage = format!( + "Usage: cargo run --example {} -- [{}]", + ExampleKind::EXAMPLE_NAME, + ExampleKind::VARIANTS.join("|") + ); + + let example: ExampleKind = std::env::args() + .nth(1) + .unwrap_or_else(|| ExampleKind::All.to_string()) + .parse() + .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?; + + example.run().await +} diff --git a/datafusion-examples/examples/flight/flight_server.rs b/datafusion-examples/examples/flight/server.rs similarity index 88% rename from datafusion-examples/examples/flight/flight_server.rs rename to datafusion-examples/examples/flight/server.rs index 22265e415fbdb..b73c81dd7d2c3 100644 --- a/datafusion-examples/examples/flight/flight_server.rs +++ b/datafusion-examples/examples/flight/server.rs @@ -15,25 +15,26 @@ // specific language governing permissions and limitations // under the License. -use arrow::ipc::writer::{CompressionContext, DictionaryTracker, IpcDataGenerator}; +//! See `main.rs` for how to run it. + use std::sync::Arc; +use arrow::ipc::writer::{CompressionContext, DictionaryTracker, IpcDataGenerator}; +use arrow_flight::{ + Action, ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo, + HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, Ticket, + flight_service_server::FlightService, flight_service_server::FlightServiceServer, +}; use arrow_flight::{PollInfo, SchemaAsIpc}; use datafusion::arrow::error::ArrowError; use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::{ListingOptions, ListingTableUrl}; +use datafusion::prelude::*; +use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet}; use futures::stream::BoxStream; use tonic::transport::Server; use tonic::{Request, Response, Status, Streaming}; -use datafusion::prelude::*; - -use arrow_flight::{ - flight_service_server::FlightService, flight_service_server::FlightServiceServer, - Action, ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo, - HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, Ticket, -}; - #[derive(Clone)] pub struct FlightServiceImpl {} @@ -83,16 +84,21 @@ impl FlightService for FlightServiceImpl { // create local execution context let ctx = SessionContext::new(); - let testdata = datafusion::test_util::parquet_test_data(); + // Convert the CSV input into a temporary Parquet directory for querying + let dataset = ExampleDataset::Cars; + let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()) + .await + .map_err(|e| { + Status::internal(format!("Error writing csv to parquet: {e}")) + })?; + let parquet_path = parquet_temp.path_str().map_err(|e| { + Status::internal(format!("Error getting parquet path: {e}")) + })?; // register parquet file with
the execution context - ctx.register_parquet( - "alltypes_plain", - &format!("{testdata}/alltypes_plain.parquet"), - ParquetReadOptions::default(), - ) - .await - .map_err(to_tonic_err)?; + ctx.register_parquet("cars", parquet_path, ParquetReadOptions::default()) + .await + .map_err(to_tonic_err)?; // create the DataFrame let df = ctx.sql(sql).await.map_err(to_tonic_err)?; @@ -187,6 +193,7 @@ impl FlightService for FlightServiceImpl { } } +#[expect(clippy::needless_pass_by_value)] fn to_tonic_err(e: datafusion::error::DataFusionError) -> Status { Status::internal(format!("{e:?}")) } @@ -194,8 +201,7 @@ fn to_tonic_err(e: datafusion::error::DataFusionError) -> Status { /// This example shows how to wrap DataFusion with `FlightService` to support looking up schema information for /// Parquet files and executing SQL queries against them on a remote server. /// This example is run along-side the example `flight_client`. -#[tokio::main] -async fn main() -> Result<(), Box<dyn std::error::Error>> { +pub async fn server() -> Result<(), Box<dyn std::error::Error>> { let addr = "0.0.0.0:50051".parse()?; let service = FlightServiceImpl {}; diff --git a/datafusion-examples/examples/flight/flight_sql_server.rs b/datafusion-examples/examples/flight/sql_server.rs similarity index 94% rename from datafusion-examples/examples/flight/flight_sql_server.rs rename to datafusion-examples/examples/flight/sql_server.rs index c35debec7d712..e55aaa7250ea7 100644 --- a/datafusion-examples/examples/flight/flight_sql_server.rs +++ b/datafusion-examples/examples/flight/sql_server.rs @@ -15,6 +15,11 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. + +use std::pin::Pin; +use std::sync::Arc; + use arrow::array::{ArrayRef, StringArray}; use arrow::datatypes::{DataType, Field, Schema}; use arrow::ipc::writer::IpcWriteOptions; @@ -36,12 +41,11 @@ use arrow_flight::{ use dashmap::DashMap; use datafusion::logical_expr::LogicalPlan; use datafusion::prelude::{DataFrame, ParquetReadOptions, SessionConfig, SessionContext}; +use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet}; use futures::{Stream, StreamExt, TryStreamExt}; use log::info; use mimalloc::MiMalloc; use prost::Message; -use std::pin::Pin; -use std::sync::Arc; use tonic::metadata::MetadataValue; use tonic::transport::Server; use tonic::{Request, Response, Status, Streaming}; @@ -68,9 +72,7 @@ macro_rules!
status { /// /// Based heavily on Ballista's implementation: https://github.com/apache/datafusion-ballista/blob/main/ballista/scheduler/src/flight_sql.rs /// and the example in arrow-rs: https://github.com/apache/arrow-rs/blob/master/arrow-flight/examples/flight_sql_server.rs -/// -#[tokio::main] -async fn main() -> Result<(), Box<dyn std::error::Error>> { +pub async fn sql_server() -> Result<(), Box<dyn std::error::Error>> { env_logger::init(); let addr = "0.0.0.0:50051".parse()?; let service = FlightSqlServiceImpl { @@ -100,22 +102,24 @@ impl FlightSqlServiceImpl { .with_information_schema(true); let ctx = Arc::new(SessionContext::new_with_config(session_config)); - let testdata = datafusion::test_util::parquet_test_data(); + // Convert the CSV input into a temporary Parquet directory for querying + let dataset = ExampleDataset::Cars; + let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()) + .await + .map_err(|e| status!("Error writing csv to parquet", e))?; + let parquet_path = parquet_temp + .path_str() + .map_err(|e| status!("Error getting parquet path", e))?; // register parquet file with the execution context - ctx.register_parquet( - "alltypes_plain", - &format!("{testdata}/alltypes_plain.parquet"), - ParquetReadOptions::default(), - ) - .await - .map_err(|e| status!("Error registering table", e))?; + ctx.register_parquet("cars", parquet_path, ParquetReadOptions::default()) + .await + .map_err(|e| status!("Error registering table", e))?; self.contexts.insert(uuid.clone(), ctx); Ok(uuid) } - #[allow(clippy::result_large_err)] fn get_ctx<T>(&self, req: &Request<T>) -> Result<Arc<SessionContext>, Status> { // get the token from the authorization header on Request let auth = req @@ -141,7 +145,6 @@ impl FlightSqlServiceImpl { } } - #[allow(clippy::result_large_err)] fn get_plan(&self, handle: &str) -> Result<LogicalPlan, Status> { if let Some(plan) = self.statements.get(handle) { Ok(plan.clone()) @@ -150,7 +153,6 @@ impl FlightSqlServiceImpl { } } - #[allow(clippy::result_large_err)] fn get_result(&self, handle: &str) -> Result<Vec<RecordBatch>, Status> { if let Some(result) = self.results.get(handle) { Ok(result.clone()) @@ -198,13 +200,11 @@ impl FlightSqlServiceImpl { .unwrap() } - #[allow(clippy::result_large_err)] fn remove_plan(&self, handle: &str) -> Result<(), Status> { self.statements.remove(&handle.to_string()); Ok(()) } - #[allow(clippy::result_large_err)] fn remove_result(&self, handle: &str) -> Result<(), Status> { self.results.remove(&handle.to_string()); Ok(()) @@ -416,7 +416,9 @@ impl FlightSqlService for FlightSqlServiceImpl { ) -> Result<(), Status> { let handle = std::str::from_utf8(&handle.prepared_statement_handle); if let Ok(handle) = handle { - info!("do_action_close_prepared_statement: removing plan and results for {handle}"); + info!( + "do_action_close_prepared_statement: removing plan and results for {handle}" + ); let _ = self.remove_plan(handle); let _ = self.remove_result(handle); } diff --git a/datafusion-examples/examples/composed_extension_codec.rs b/datafusion-examples/examples/proto/composed_extension_codec.rs similarity index 84% rename from datafusion-examples/examples/composed_extension_codec.rs rename to datafusion-examples/examples/proto/composed_extension_codec.rs index 57f2c370413aa..ae9503dd87b19 100644 --- a/datafusion-examples/examples/composed_extension_codec.rs +++ b/datafusion-examples/examples/proto/composed_extension_codec.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. +//! +//! 
This example demonstrates how to compose multiple PhysicalExtensionCodecs //! //! This can be helpful when an Execution plan tree has different nodes from different crates @@ -30,12 +32,12 @@ //! DeltaScan //! ``` -use std::any::Any; use std::fmt::Debug; use std::sync::Arc; -use datafusion::common::internal_err; use datafusion::common::Result; +use datafusion::common::Result; +use datafusion::common::internal_err; +use datafusion::common::tree_node::TreeNodeRecursion; use datafusion::execution::TaskContext; use datafusion::physical_plan::{DisplayAs, ExecutionPlan}; use datafusion::prelude::SessionContext; @@ -44,8 +46,8 @@ use datafusion_proto::physical_plan::{ }; use datafusion_proto::protobuf; -#[tokio::main] -async fn main() { +/// Example of using multiple extension codecs for serialization / deserialization +pub async fn composed_extension_codec() -> Result<()> { // build execution plan that has both types of nodes // // Note each node requires a different `PhysicalExtensionCodec` to decode @@ -66,16 +68,16 @@ async fn main() { protobuf::PhysicalPlanNode::try_from_physical_plan( exec_plan.clone(), &composed_codec, - ) - .expect("to proto"); + )?; // deserialize proto back to execution plan - let result_exec_plan: Arc<dyn ExecutionPlan> = proto - .try_into_physical_plan(&ctx.task_ctx(), &composed_codec) - .expect("from proto"); + let result_exec_plan: Arc<dyn ExecutionPlan> = + proto.try_into_physical_plan(&ctx.task_ctx(), &composed_codec)?; // assert that the original and deserialized execution plans are equal assert_eq!(format!("{exec_plan:?}"), format!("{result_exec_plan:?}")); + + Ok(()) } /// This example has two types of nodes: `ParentExec` and `ChildExec` which can only @@ -100,11 +102,7 @@ impl ExecutionPlan for ParentExec { "ParentExec" } - fn as_any(&self) -> &dyn Any { - self - } - - fn properties(&self) -> &datafusion::physical_plan::PlanProperties { + fn properties(&self) -> &Arc<datafusion::physical_plan::PlanProperties> { unreachable!() } @@ -126,6 +124,15 @@ impl ExecutionPlan for ParentExec { ) -> Result { unreachable!() } + + fn apply_expressions( + &self, + _f: &mut dyn FnMut( + &dyn datafusion::physical_plan::PhysicalExpr, + ) -> Result<TreeNodeRecursion>, + ) -> Result<TreeNodeRecursion> { + Ok(TreeNodeRecursion::Continue) + } } /// A PhysicalExtensionCodec that can serialize and deserialize ParentExec @@ -149,7 +156,7 @@ impl PhysicalExtensionCodec for ParentPhysicalExtensionCodec { } fn try_encode(&self, node: Arc<dyn ExecutionPlan>, buf: &mut Vec<u8>) -> Result<()> { - if node.as_any().downcast_ref::<ParentExec>().is_some() { + if node.is::<ParentExec>() { buf.extend_from_slice("ParentExec".as_bytes()); Ok(()) } else { @@ -176,11 +183,7 @@ impl ExecutionPlan for ChildExec { "ChildExec" } - fn as_any(&self) -> &dyn Any { - self - } - - fn properties(&self) -> &datafusion::physical_plan::PlanProperties { + fn properties(&self) -> &Arc<datafusion::physical_plan::PlanProperties> { unreachable!() } @@ -202,6 +205,15 @@ impl ExecutionPlan for ChildExec { ) -> Result { unreachable!() } + + fn apply_expressions( + &self, + _f: &mut dyn FnMut( + &dyn datafusion::physical_plan::PhysicalExpr, + ) -> Result<TreeNodeRecursion>, + ) -> Result<TreeNodeRecursion> { + Ok(TreeNodeRecursion::Continue) + } } /// A PhysicalExtensionCodec that can serialize and deserialize ChildExec @@ -223,7 +235,7 @@ impl PhysicalExtensionCodec for ChildPhysicalExtensionCodec { } fn try_encode(&self, node: Arc<dyn ExecutionPlan>, buf: &mut Vec<u8>) -> Result<()> { - if node.as_any().downcast_ref::<ChildExec>().is_some() { + if node.is::<ChildExec>() { buf.extend_from_slice("ChildExec".as_bytes()); Ok(()) } else { diff --git a/datafusion-examples/examples/proto/expression_deduplication.rs b/datafusion-examples/examples/proto/expression_deduplication.rs new file mode 100644 index 0000000000000..26d246b2efca8 --- 
/dev/null +++ b/datafusion-examples/examples/proto/expression_deduplication.rs @@ -0,0 +1,272 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! See `main.rs` for how to run it. +//! +//! This example demonstrates how to use the +//! `PhysicalProtoConverterExtension` trait's interception methods to +//! implement expression deduplication during deserialization. +//! +//! This pattern is inspired by PR #18192, which introduces expression caching +//! to reduce memory usage when deserializing plans with duplicate expressions. +//! +//! The key insight is that identical expressions serialize to identical protobuf bytes. +//! By caching deserialized expressions keyed by their protobuf bytes, we can: +//! 1. Return the same Arc for duplicate expressions +//! 2. Reduce memory allocation during deserialization +//! 3. Enable downstream optimizations that rely on Arc pointer equality +//! +//! This demonstrates the decorator pattern enabled by +//! `PhysicalProtoConverterExtension`, where physical-expression +//! serialization and deserialization route through converter hooks. + +use std::collections::HashMap; +use std::fmt::Debug; +use std::sync::{Arc, RwLock}; + +use arrow::datatypes::{DataType, Field, Schema}; +use datafusion::common::Result; +use datafusion::execution::TaskContext; +use datafusion::logical_expr::Operator; +use datafusion::physical_expr::PhysicalExpr; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::physical_plan::expressions::{BinaryExpr, col}; +use datafusion::physical_plan::filter::FilterExec; +use datafusion::physical_plan::placeholder_row::PlaceholderRowExec; +use datafusion::prelude::SessionContext; +use datafusion_proto::physical_plan::from_proto::parse_physical_expr_with_converter; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr_with_converter; +use datafusion_proto::physical_plan::{ + DefaultPhysicalExtensionCodec, PhysicalExtensionCodec, PhysicalPlanDecodeContext, + PhysicalProtoConverterExtension, +}; +use datafusion_proto::protobuf::{PhysicalExprNode, PhysicalPlanNode}; +use prost::Message; + +/// Example showing how to implement expression deduplication using the codec decorator pattern. +/// +/// This demonstrates: +/// 1. Creating a CachingCodec that caches expressions by their protobuf bytes +/// 2. Intercepting deserialization to return cached Arcs for duplicate expressions +/// 3. Verifying that duplicate expressions share the same Arc after deserialization +/// +/// Deduplication here is keyed by the protobuf bytes representing the expression; +/// in practice it could instead be keyed on e.g. the pointer address of the +/// serialized expression in memory, but bytes are simpler to demonstrate.
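The caching idea in miniature, independent of DataFusion types; a sketch using `String` payloads rather than physical expressions:

```rust
use std::collections::HashMap;
use std::sync::Arc;

// Map serialized bytes -> shared value; identical keys yield the same Arc
struct ByteKeyedCache {
    entries: HashMap<Vec<u8>, Arc<String>>,
}

impl ByteKeyedCache {
    fn get_or_insert(&mut self, key: &[u8], make: impl FnOnce() -> String) -> Arc<String> {
        if let Some(hit) = self.entries.get(key) {
            return Arc::clone(hit); // cache hit: no new allocation
        }
        let value = Arc::new(make());
        self.entries.insert(key.to_vec(), Arc::clone(&value));
        value
    }
}

fn main() {
    let mut cache = ByteKeyedCache { entries: HashMap::new() };
    let first = cache.get_or_insert(b"col(a)", || "col(a)".to_string());
    let second = cache.get_or_insert(b"col(a)", || "col(a)".to_string());
    // pointer equality proves the duplicate was deduplicated
    assert!(Arc::ptr_eq(&first, &second));
}
```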
+/// +/// In this case our expression is trivial and just for demonstration purposes. +/// In real scenarios, expressions can be much more complex, e.g. a large InList +/// expression could be megabytes in size, so deduplication can save significant memory +/// in addition to more correctly representing the original plan structure. +pub async fn expression_deduplication() -> Result<()> { + println!("=== Expression Deduplication Example ===\n"); + + // Create a schema for our test expressions + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Boolean, false)])); + + // Step 1: Create expressions with duplicates + println!("Step 1: Creating expressions with duplicates..."); + + // Create expression: col("a") + let a = col("a", &schema)?; + + // Create a clone to show duplicates + let a_clone = Arc::clone(&a); + + // Combine: a OR a_clone + let combined_expr = + Arc::new(BinaryExpr::new(a, Operator::Or, a_clone)) as Arc<dyn PhysicalExpr>; + println!(" Created expression: a OR a with duplicates"); + println!(" Note: a appears twice in the expression tree\n"); + // Step 2: Create a filter plan with this expression + println!("Step 2: Creating physical plan with the expression..."); + + let input = Arc::new(PlaceholderRowExec::new(Arc::clone(&schema))); + let filter_plan: Arc<dyn ExecutionPlan> = + Arc::new(FilterExec::try_new(combined_expr, input)?); + + println!(" Created FilterExec with duplicate sub-expressions\n"); + + // Step 3: Serialize with the caching codec + println!("Step 3: Serializing plan..."); + + let extension_codec = DefaultPhysicalExtensionCodec {}; + let caching_converter = CachingCodec::new(); + let proto = + caching_converter.execution_plan_to_proto(&filter_plan, &extension_codec)?; + + // Serialize to bytes + let mut bytes = Vec::new(); + proto.encode(&mut bytes).unwrap(); + println!(" Serialized plan to {} bytes\n", bytes.len()); + + // Step 4: Deserialize with the caching codec + println!("Step 4: Deserializing plan with CachingCodec..."); + + let ctx = SessionContext::new(); + let deserialized_plan = proto.try_into_physical_plan_with_converter( + &ctx.task_ctx(), + &extension_codec, + &caching_converter, + )?; + + // Step 5: check that we deduplicated expressions + println!("Step 5: Checking for deduplicated expressions..."); + let Some(filter_exec) = deserialized_plan.downcast_ref::<FilterExec>() else { + panic!("Deserialized plan is not a FilterExec"); + }; + let predicate = Arc::clone(filter_exec.predicate()); + let binary_expr = predicate + .downcast_ref::<BinaryExpr>() + .expect("Predicate is not a BinaryExpr"); + let left = &binary_expr.left(); + let right = &binary_expr.right(); + // Check if left and right point to the same Arc + let deduplicated = Arc::ptr_eq(left, right); + if deduplicated { + println!(" Success: Duplicate expressions were deduplicated!"); + println!( + " Cache Stats: hits={}, misses={}", + caching_converter.stats.read().unwrap().cache_hits, + caching_converter.stats.read().unwrap().cache_misses, + ); + } else { + println!(" Failure: Duplicate expressions were NOT deduplicated."); + } + + Ok(()) +} + +// ============================================================================ +// CachingCodec - Implements expression deduplication +// ============================================================================ + +/// Statistics for cache performance monitoring +#[derive(Debug, Default)] +struct CacheStats { + cache_hits: usize, + cache_misses: usize, +} + +/// A codec that caches deserialized expressions to enable deduplication.
+/// +/// When deserializing, if we've already seen the same protobuf bytes, +/// we return the cached Arc instead of creating a new allocation. +#[derive(Debug, Default)] +struct CachingCodec { + /// Cache mapping protobuf bytes -> deserialized expression + expr_cache: RwLock<HashMap<Vec<u8>, Arc<dyn PhysicalExpr>>>, + /// Statistics for demonstration + stats: RwLock<CacheStats>, +} + +impl CachingCodec { + fn new() -> Self { + Self::default() + } +} + +impl PhysicalExtensionCodec for CachingCodec { + // Required: decode custom extension nodes + fn try_decode( + &self, + _buf: &[u8], + _inputs: &[Arc<dyn ExecutionPlan>], + _ctx: &TaskContext, + ) -> Result<Arc<dyn ExecutionPlan>> { + datafusion::common::not_impl_err!("No custom extension nodes") + } + + // Required: encode custom execution plans + fn try_encode( + &self, + _node: Arc<dyn ExecutionPlan>, + _buf: &mut Vec<u8>, + ) -> Result<()> { + datafusion::common::not_impl_err!("No custom extension nodes") + } +} + +impl PhysicalProtoConverterExtension for CachingCodec { + fn proto_to_execution_plan( + &self, + proto: &PhysicalPlanNode, + ctx: &PhysicalPlanDecodeContext<'_>, + ) -> Result<Arc<dyn ExecutionPlan>> { + self.default_proto_to_execution_plan(proto, ctx) + } + + fn execution_plan_to_proto( + &self, + plan: &Arc<dyn ExecutionPlan>, + extension_codec: &dyn PhysicalExtensionCodec, + ) -> Result<PhysicalPlanNode> { + PhysicalPlanNode::try_from_physical_plan_with_converter( + Arc::clone(plan), + extension_codec, + self, + ) + } + + // CACHING IMPLEMENTATION: Intercept expression deserialization + fn proto_to_physical_expr( + &self, + proto: &PhysicalExprNode, + input_schema: &Schema, + ctx: &PhysicalPlanDecodeContext<'_>, + ) -> Result<Arc<dyn PhysicalExpr>> { + // Create cache key from protobuf bytes + let mut key = Vec::new(); + proto.encode(&mut key).map_err(|e| { + datafusion::error::DataFusionError::Internal(format!( + "Failed to encode proto for cache key: {e}" + )) + })?; + + // Check cache first + { + let cache = self.expr_cache.read().unwrap(); + if let Some(cached) = cache.get(&key) { + // Cache hit! Update stats and return cached Arc + let mut stats = self.stats.write().unwrap(); + stats.cache_hits += 1; + return Ok(Arc::clone(cached)); + } + } + + // Cache miss - deserialize and store + let expr = parse_physical_expr_with_converter(proto, input_schema, ctx, self)?; + + // Store in cache + { + let mut cache = self.expr_cache.write().unwrap(); + cache.insert(key, Arc::clone(&expr)); + let mut stats = self.stats.write().unwrap(); + stats.cache_misses += 1; + } + + Ok(expr) + } + + fn physical_expr_to_proto( + &self, + expr: &Arc<dyn PhysicalExpr>, + codec: &dyn PhysicalExtensionCodec, + ) -> Result<PhysicalExprNode> { + serialize_physical_expr_with_converter(expr, codec, self) + } +} diff --git a/datafusion-examples/examples/proto/main.rs b/datafusion-examples/examples/proto/main.rs new file mode 100644 index 0000000000000..3f525b5d46afa --- /dev/null +++ b/datafusion-examples/examples/proto/main.rs @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! # Examples demonstrating DataFusion's plan serialization via the `datafusion-proto` crate +//! +//! These examples show how to use multiple extension codecs for serialization / deserialization. +//! +//! ## Usage +//! ```bash +//! cargo run --example proto -- [all|composed_extension_codec|expression_deduplication] +//! ``` +//! +//! Each subcommand runs a corresponding example: +//! - `all` — run all examples included in this module +//! +//! - `composed_extension_codec` +//! (file: composed_extension_codec.rs, desc: Use multiple extension codecs for serialization/deserialization) +//! +//! - `expression_deduplication` +//! (file: expression_deduplication.rs, desc: Example of expression caching/deduplication using the codec decorator pattern) + +mod composed_extension_codec; +mod expression_deduplication; + +use datafusion::error::{DataFusionError, Result}; +use strum::{IntoEnumIterator, VariantNames}; +use strum_macros::{Display, EnumIter, EnumString, VariantNames}; + +#[derive(EnumIter, EnumString, Display, VariantNames)] +#[strum(serialize_all = "snake_case")] +enum ExampleKind { + All, + ComposedExtensionCodec, + ExpressionDeduplication, +} + +impl ExampleKind { + const EXAMPLE_NAME: &str = "proto"; + + fn runnable() -> impl Iterator<Item = Self> { + ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All)) + } + + async fn run(&self) -> Result<()> { + match self { + ExampleKind::All => { + for example in ExampleKind::runnable() { + println!("Running example: {example}"); + Box::pin(example.run()).await?; + } + } + ExampleKind::ComposedExtensionCodec => { + composed_extension_codec::composed_extension_codec().await? + } + ExampleKind::ExpressionDeduplication => { + expression_deduplication::expression_deduplication().await? + } + } + Ok(()) + } +} + +#[tokio::main] +async fn main() -> Result<()> { + let usage = format!( + "Usage: cargo run --example {} -- [{}]", + ExampleKind::EXAMPLE_NAME, + ExampleKind::VARIANTS.join("|") + ); + + let example: ExampleKind = std::env::args() + .nth(1) + .unwrap_or_else(|| ExampleKind::All.to_string()) + .parse() + .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?; + + example.run().await +} diff --git a/datafusion-examples/examples/analyzer_rule.rs b/datafusion-examples/examples/query_planning/analyzer_rule.rs similarity index 97% rename from datafusion-examples/examples/analyzer_rule.rs rename to datafusion-examples/examples/query_planning/analyzer_rule.rs index cb81cd167a88b..a86f5cdd2a5e3 100644 --- a/datafusion-examples/examples/analyzer_rule.rs +++ b/datafusion-examples/examples/query_planning/analyzer_rule.rs @@ -15,11 +15,13 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. + use arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray}; +use datafusion::common::Result; use datafusion::common::config::ConfigOptions; use datafusion::common::tree_node::{Transformed, TreeNode}; -use datafusion::common::Result; -use datafusion::logical_expr::{col, lit, Expr, LogicalPlan, LogicalPlanBuilder}; +use datafusion::logical_expr::{Expr, LogicalPlan, LogicalPlanBuilder, col, lit}; use datafusion::optimizer::analyzer::AnalyzerRule; use datafusion::prelude::SessionContext; use std::sync::{Arc, Mutex}; @@ -35,8 +37,7 @@ use std::sync::{Arc, Mutex}; /// level access control scheme by introducing a filter to the query.
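A sketch of that row-level-filter idea; the `tenant_id` column and literal are illustrative, not part of the example:

```rust
use datafusion::common::Result;
use datafusion::common::config::ConfigOptions;
use datafusion::logical_expr::{LogicalPlan, LogicalPlanBuilder, col, lit};
use datafusion::optimizer::analyzer::AnalyzerRule;

// Adds `tenant_id = 42` on top of whatever plan the user wrote;
// assumes the column exists in the plan's schema
#[derive(Debug)]
struct RowLevelFilter;

impl AnalyzerRule for RowLevelFilter {
    fn analyze(&self, plan: LogicalPlan, _config: &ConfigOptions) -> Result<LogicalPlan> {
        LogicalPlanBuilder::from(plan)
            .filter(col("tenant_id").eq(lit(42)))?
            .build()
    }

    fn name(&self) -> &str {
        "row_level_filter"
    }
}
```

Registered via `SessionContext::add_analyzer_rule`, such a rule runs before the optimizer, so the injected predicate participates in later optimizations.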
/// /// See [optimizer_rule.rs] for an example of a optimizer rule -#[tokio::main] -pub async fn main() -> Result<()> { +pub async fn analyzer_rule() -> Result<()> { // AnalyzerRules run before OptimizerRules. // // DataFusion includes several built in AnalyzerRules for tasks such as type diff --git a/datafusion-examples/examples/expr_api.rs b/datafusion-examples/examples/query_planning/expr_api.rs similarity index 95% rename from datafusion-examples/examples/expr_api.rs rename to datafusion-examples/examples/query_planning/expr_api.rs index 56f960870e58a..c087019c687c5 100644 --- a/datafusion-examples/examples/expr_api.rs +++ b/datafusion-examples/examples/query_planning/expr_api.rs @@ -15,10 +15,12 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. + use std::collections::HashMap; use std::sync::Arc; -use arrow::array::{BooleanArray, Int32Array, Int8Array}; +use arrow::array::{BooleanArray, Int8Array, Int32Array}; use arrow::record_batch::RecordBatch; use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit}; @@ -35,7 +37,7 @@ use datafusion::logical_expr::simplify::SimplifyContext; use datafusion::logical_expr::{ColumnarValue, ExprFunctionExt, ExprSchemable, Operator}; use datafusion::optimizer::analyzer::type_coercion::TypeCoercionRewriter; use datafusion::optimizer::simplify_expressions::ExprSimplifier; -use datafusion::physical_expr::{analyze, AnalysisContext, ExprBoundaries}; +use datafusion::physical_expr::{AnalysisContext, ExprBoundaries, analyze}; use datafusion::prelude::*; /// This example demonstrates the DataFusion [`Expr`] API. @@ -55,8 +57,7 @@ use datafusion::prelude::*; /// 5. Analyze predicates for boundary ranges: [`range_analysis_demo`] /// 6. Get the types of the expressions: [`expression_type_demo`] /// 7. 
Apply type coercion to expressions: [`type_coercion_demo`] -#[tokio::main] -async fn main() -> Result<()> { +pub async fn expr_api() -> Result<()> { // The easiest way to do create expressions is to use the // "fluent"-style API: let expr = col("a") + lit(5); @@ -174,8 +175,10 @@ fn simplify_demo() -> Result<()> { // the ExecutionProps carries information needed to simplify // expressions, such as the current time (to evaluate `now()` // correctly) - let props = ExecutionProps::new(); - let context = SimplifyContext::new(&props).with_schema(schema); + let context = SimplifyContext::builder() + .with_schema(schema) + .with_current_time() + .build(); let simplifier = ExprSimplifier::new(context); // And then call the simplify_expr function: @@ -190,12 +193,15 @@ fn simplify_demo() -> Result<()> { // here are some other examples of what DataFusion is capable of let schema = Schema::new(vec![make_field("i", DataType::Int64)]).to_dfschema_ref()?; - let context = SimplifyContext::new(&props).with_schema(schema.clone()); + let context = SimplifyContext::builder() + .with_schema(Arc::clone(&schema)) + .with_current_time() + .build(); let simplifier = ExprSimplifier::new(context); // basic arithmetic simplification // i + 1 + 2 => i + 3 - // (note this is not done if the expr is (col("i") + (lit(1) + lit(2)))) + // (note this is not done if the expr is (col("i") + lit(1) + lit(2))) assert_eq!( simplifier.simplify(col("i") + (lit(1) + lit(2)))?, col("i") + lit(3) @@ -257,7 +263,7 @@ fn range_analysis_demo() -> Result<()> { // You can provide DataFusion any known boundaries on the values of `date` // (for example, maybe you know you only have data up to `2020-09-15`), but // in this case, let's say we don't know any boundaries beforehand so we use - // `try_new_unknown` + // `try_new_unbounded` let boundaries = ExprBoundaries::try_new_unbounded(&schema)?; // Now, we invoke the analysis code to perform the range analysis @@ -302,6 +308,7 @@ fn boundary_analysis_and_selectivity_demo() -> Result<()> { min_value: Precision::Exact(ScalarValue::Int64(Some(1))), sum_value: Precision::Absent, distinct_count: Precision::Absent, + byte_size: Precision::Absent, }; // We can then build our expression boundaries from the column statistics @@ -342,9 +349,11 @@ fn boundary_analysis_and_selectivity_demo() -> Result<()> { // // (a' - b' + 1) / (a - b) // (10000 - 5000 + 1) / (10000 - 1) - assert!(analysis - .selectivity - .is_some_and(|selectivity| (0.5..=0.6).contains(&selectivity))); + assert!( + analysis + .selectivity + .is_some_and(|selectivity| (0.5..=0.6).contains(&selectivity)) + ); Ok(()) } @@ -369,6 +378,7 @@ fn boundary_analysis_in_conjunctions_demo() -> Result<()> { min_value: Precision::Exact(ScalarValue::Int64(Some(14))), sum_value: Precision::Absent, distinct_count: Precision::Absent, + byte_size: Precision::Absent, }; let initial_boundaries = @@ -414,9 +424,11 @@ fn boundary_analysis_in_conjunctions_demo() -> Result<()> { // // Granted a column such as age will more likely follow a Normal distribution // as such our selectivity estimation will not be as good as it can. - assert!(analysis - .selectivity - .is_some_and(|selectivity| (0.1..=0.2).contains(&selectivity))); + assert!( + analysis + .selectivity + .is_some_and(|selectivity| (0.1..=0.2).contains(&selectivity)) + ); // The above example was a good way to look at how we can derive better // interval and get a lower selectivity during boundary analysis. 
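The selectivity arithmetic quoted in these comments, as a tiny worked check:

```rust
fn main() {
    // (max' - min' + 1) / (max - min): uniform-distribution estimate
    let (min, max) = (1.0_f64, 10_000.0_f64);
    let (new_min, new_max) = (5_000.0_f64, 10_000.0_f64);
    let selectivity = (new_max - new_min + 1.0) / (max - min);
    // ~0.50015, inside the (0.5..=0.6) band the example asserts
    assert!((0.5..=0.6).contains(&selectivity));
    println!("estimated selectivity: {selectivity:.5}");
}
```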
@@ -532,10 +544,11 @@ fn type_coercion_demo() -> Result<()> { let physical_expr = datafusion::physical_expr::create_physical_expr(&expr, &df_schema, &props)?; let e = physical_expr.evaluate(&batch).unwrap_err(); - assert!(e - .find_root() - .to_string() - .contains("Invalid comparison operation: Int8 > Int32")); + assert!( + e.find_root() + .to_string() + .contains("Invalid comparison operation: Int8 > Int32") + ); // 1. Type coercion with `SessionContext::create_physical_expr` which implicitly applies type coercion before constructing the physical expr. let physical_expr = @@ -543,7 +556,10 @@ fn type_coercion_demo() -> Result<()> { assert!(physical_expr.evaluate(&batch).is_ok()); // 2. Type coercion with `ExprSimplifier::coerce`. - let context = SimplifyContext::new(&props).with_schema(Arc::new(df_schema.clone())); + let context = SimplifyContext::builder() + .with_schema(Arc::new(df_schema.clone())) + .with_current_time() + .build(); let simplifier = ExprSimplifier::new(context); let coerced_expr = simplifier.coerce(expr.clone(), &df_schema)?; let physical_expr = datafusion::physical_expr::create_physical_expr( diff --git a/datafusion-examples/examples/query_planning/main.rs b/datafusion-examples/examples/query_planning/main.rs new file mode 100644 index 0000000000000..d3f99aedceb3d --- /dev/null +++ b/datafusion-examples/examples/query_planning/main.rs @@ -0,0 +1,124 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! # Query planning and optimization examples +//! +//! These examples demonstrate internal mechanics of the query planning and optimization layers. +//! +//! ## Usage +//! ```bash +//! cargo run --example query_planning -- [all|analyzer_rule|expr_api|optimizer_rule|parse_sql_expr|plan_to_sql|planner_api|pruning|thread_pools] +//! ``` +//! +//! Each subcommand runs a corresponding example: +//! - `all` — run all examples included in this module +//! +//! - `analyzer_rule` +//! (file: analyzer_rule.rs, desc: Custom AnalyzerRule to change query semantics) +//! +//! - `expr_api` +//! (file: expr_api.rs, desc: Create, execute, analyze, and coerce Exprs) +//! +//! - `optimizer_rule` +//! (file: optimizer_rule.rs, desc: Replace predicates via a custom OptimizerRule) +//! +//! - `parse_sql_expr` +//! (file: parse_sql_expr.rs, desc: Parse SQL into DataFusion Expr) +//! +//! - `plan_to_sql` +//! (file: plan_to_sql.rs, desc: Generate SQL from expressions or plans) +//! +//! - `planner_api` +//! (file: planner_api.rs, desc: APIs for logical and physical plan manipulation) +//! +//! - `pruning` +//! (file: pruning.rs, desc: Use pruning to skip irrelevant files) +//! +//! - `thread_pools` +//! 
(file: thread_pools.rs, desc: Configure custom thread pools for DataFusion execution) + +mod analyzer_rule; +mod expr_api; +mod optimizer_rule; +mod parse_sql_expr; +mod plan_to_sql; +mod planner_api; +mod pruning; +mod thread_pools; + +use datafusion::error::{DataFusionError, Result}; +use strum::{IntoEnumIterator, VariantNames}; +use strum_macros::{Display, EnumIter, EnumString, VariantNames}; + +#[derive(EnumIter, EnumString, Display, VariantNames)] +#[strum(serialize_all = "snake_case")] +enum ExampleKind { + All, + AnalyzerRule, + ExprApi, + OptimizerRule, + ParseSqlExpr, + PlanToSql, + PlannerApi, + Pruning, + ThreadPools, +} + +impl ExampleKind { + const EXAMPLE_NAME: &str = "query_planning"; + + fn runnable() -> impl Iterator<Item = Self> { + ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All)) + } + + async fn run(&self) -> Result<()> { + match self { + ExampleKind::All => { + for example in ExampleKind::runnable() { + println!("Running example: {example}"); + Box::pin(example.run()).await?; + } + } + ExampleKind::AnalyzerRule => analyzer_rule::analyzer_rule().await?, + ExampleKind::ExprApi => expr_api::expr_api().await?, + ExampleKind::OptimizerRule => optimizer_rule::optimizer_rule().await?, + ExampleKind::ParseSqlExpr => parse_sql_expr::parse_sql_expr().await?, + ExampleKind::PlanToSql => plan_to_sql::plan_to_sql_examples().await?, + ExampleKind::PlannerApi => planner_api::planner_api().await?, + ExampleKind::Pruning => pruning::pruning().await?, + ExampleKind::ThreadPools => thread_pools::thread_pools().await?, + } + Ok(()) + } +} + +#[tokio::main] +async fn main() -> Result<()> { + let usage = format!( + "Usage: cargo run --example {} -- [{}]", + ExampleKind::EXAMPLE_NAME, + ExampleKind::VARIANTS.join("|") + ); + + let example: ExampleKind = std::env::args() + .nth(1) + .unwrap_or_else(|| ExampleKind::All.to_string()) + .parse() + .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?; + + example.run().await +} diff --git a/datafusion-examples/examples/optimizer_rule.rs b/datafusion-examples/examples/query_planning/optimizer_rule.rs similarity index 97% rename from datafusion-examples/examples/optimizer_rule.rs rename to datafusion-examples/examples/query_planning/optimizer_rule.rs index 9c137b67432c5..67683b7fe2827 100644 --- a/datafusion-examples/examples/optimizer_rule.rs +++ b/datafusion-examples/examples/query_planning/optimizer_rule.rs @@ -15,10 +15,12 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. + use arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray}; use arrow::datatypes::DataType; use datafusion::common::tree_node::{Transformed, TreeNode}; -use datafusion::common::{assert_batches_eq, Result, ScalarValue}; +use datafusion::common::{Result, ScalarValue, assert_batches_eq}; use datafusion::logical_expr::{ BinaryExpr, ColumnarValue, Expr, LogicalPlan, Operator, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, Volatility, }; use datafusion::optimizer::ApplyOrder; use datafusion::optimizer::{OptimizerConfig, OptimizerRule}; use datafusion::prelude::SessionContext; -use std::any::Any; use std::sync::Arc; /// This example demonstrates how to add your own [`OptimizerRule`] /// to DataFusion. /// /// See [analyzer_rule.rs] for an example of AnalyzerRules, which are for /// changing plan semantics.
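For the shape of the trait this example implements, a pass-through skeleton (the real rule replaces predicates):

```rust
use datafusion::common::Result;
use datafusion::common::tree_node::Transformed;
use datafusion::logical_expr::LogicalPlan;
use datafusion::optimizer::{ApplyOrder, OptimizerConfig, OptimizerRule};

#[derive(Debug)]
struct NoopRule;

impl OptimizerRule for NoopRule {
    fn name(&self) -> &str {
        "noop_rule"
    }

    // visit children before parents
    fn apply_order(&self) -> Option<ApplyOrder> {
        Some(ApplyOrder::BottomUp)
    }

    fn rewrite(
        &self,
        plan: LogicalPlan,
        _config: &dyn OptimizerConfig,
    ) -> Result<Transformed<LogicalPlan>> {
        // `Transformed::no` reports the plan unchanged, so the optimizer
        // knows this rule did no work on this node
        Ok(Transformed::no(plan))
    }
}
```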
-#[tokio::main] -pub async fn main() -> Result<()> { +pub async fn optimizer_rule() -> Result<()> { // DataFusion includes many built in OptimizerRules for tasks such as outer // to inner join conversion and constant folding. // @@ -189,10 +189,6 @@ impl MyEq { } impl ScalarUDFImpl for MyEq { - fn as_any(&self) -> &dyn Any { - self - } - fn name(&self) -> &str { "my_eq" } diff --git a/datafusion-examples/examples/parse_sql_expr.rs b/datafusion-examples/examples/query_planning/parse_sql_expr.rs similarity index 68% rename from datafusion-examples/examples/parse_sql_expr.rs rename to datafusion-examples/examples/query_planning/parse_sql_expr.rs index 5387e7c4a05dc..74072b8480f99 100644 --- a/datafusion-examples/examples/parse_sql_expr.rs +++ b/datafusion-examples/examples/query_planning/parse_sql_expr.rs @@ -15,8 +15,11 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. + use arrow::datatypes::{DataType, Field, Schema}; use datafusion::common::DFSchema; +use datafusion::common::ScalarValue; use datafusion::logical_expr::{col, lit}; use datafusion::sql::unparser::Unparser; use datafusion::{ @@ -24,6 +27,7 @@ use datafusion::{ error::Result, prelude::{ParquetReadOptions, SessionContext}, }; +use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet}; /// This example demonstrates the programmatic parsing of SQL expressions using /// the DataFusion [`SessionContext::parse_sql_expr`] API or the [`DataFrame::parse_sql_expr`] API. @@ -32,17 +36,15 @@ use datafusion::{ /// The code in this example shows how to: /// /// 1. [`simple_session_context_parse_sql_expr_demo`]: Parse a simple SQL text into a logical -/// expression using a schema at [`SessionContext`]. +/// expression using a schema at [`SessionContext`]. /// /// 2. [`simple_dataframe_parse_sql_expr_demo`]: Parse a simple SQL text into a logical expression -/// using a schema at [`DataFrame`]. +/// using a schema at [`DataFrame`]. /// /// 3. [`query_parquet_demo`]: Query a parquet file using the parsed_sql_expr from a DataFrame. /// /// 4. [`round_trip_parse_sql_expr_demo`]: Parse a SQL text and convert it back to SQL using [`Unparser`]. - -#[tokio::main] -async fn main() -> Result<()> { +pub async fn parse_sql_expr() -> Result<()> { // See how to evaluate expressions simple_session_context_parse_sql_expr_demo()?; simple_dataframe_parse_sql_expr_demo().await?; @@ -70,18 +72,19 @@ fn simple_session_context_parse_sql_expr_demo() -> Result<()> { /// DataFusion can parse a SQL text to an logical expression using schema at [`DataFrame`]. 
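The `SessionContext` flavor of the same API, as a self-contained sketch with an illustrative one-column schema:

```rust
use arrow::datatypes::{DataType, Field, Schema};
use datafusion::common::DFSchema;
use datafusion::error::Result;
use datafusion::prelude::SessionContext;

fn main() -> Result<()> {
    // A one-column schema to resolve `a` against (illustrative)
    let schema = Schema::new(vec![Field::new("a", DataType::Int64, false)]);
    let df_schema = DFSchema::try_from(schema)?;

    let ctx = SessionContext::new();
    // SQL text in, logical `Expr` out, resolved against the schema
    let expr = ctx.parse_sql_expr("a < 5", &df_schema)?;
    println!("parsed: {expr}");
    Ok(())
}
```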
async fn simple_dataframe_parse_sql_expr_demo() -> Result<()> { - let sql = "int_col < 5 OR double_col = 8.0"; - let expr = col("int_col") - .lt(lit(5_i64)) - .or(col("double_col").eq(lit(8.0_f64))); + let sql = "car = 'red' OR speed > 1.0"; + let expr = col("car") + .eq(lit(ScalarValue::Utf8(Some("red".to_string())))) + .or(col("speed").gt(lit(1.0_f64))); let ctx = SessionContext::new(); - let testdata = datafusion::test_util::parquet_test_data(); + + // Convert the CSV input into a temporary Parquet directory for querying + let dataset = ExampleDataset::Cars; + let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?; + let df = ctx - .read_parquet( - &format!("{testdata}/alltypes_plain.parquet"), - ParquetReadOptions::default(), - ) + .read_parquet(parquet_temp.path_str()?, ParquetReadOptions::default()) .await?; let parsed_expr = df.parse_sql_expr(sql)?; @@ -93,39 +96,37 @@ async fn simple_dataframe_parse_sql_expr_demo() -> Result<()> { async fn query_parquet_demo() -> Result<()> { let ctx = SessionContext::new(); - let testdata = datafusion::test_util::parquet_test_data(); + + // Convert the CSV input into a temporary Parquet directory for querying + let dataset = ExampleDataset::Cars; + let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?; + let df = ctx - .read_parquet( - &format!("{testdata}/alltypes_plain.parquet"), - ParquetReadOptions::default(), - ) + .read_parquet(parquet_temp.path_str()?, ParquetReadOptions::default()) .await?; let df = df .clone() - .select(vec![ - df.parse_sql_expr("int_col")?, - df.parse_sql_expr("double_col")?, - ])? - .filter(df.parse_sql_expr("int_col < 5 OR double_col = 8.0")?)? + .select(vec![df.parse_sql_expr("car")?, df.parse_sql_expr("speed")?])? + .filter(df.parse_sql_expr("car = 'red' OR speed > 1.0")?)? .aggregate( - vec![df.parse_sql_expr("double_col")?], - vec![df.parse_sql_expr("SUM(int_col) as sum_int_col")?], + vec![df.parse_sql_expr("car")?], + vec![df.parse_sql_expr("SUM(speed) as sum_speed")?], )? // Directly parsing the SQL text into a sort expression is not supported yet, so // construct it programmatically - .sort(vec![col("double_col").sort(false, false)])? + .sort(vec![col("car").sort(false, false)])? .limit(0, Some(1))?; let result = df.collect().await?; assert_batches_eq!( &[ - "+------------+-------------+", - "| double_col | sum_int_col |", - "+------------+-------------+", - "| 10.1 | 4 |", - "+------------+-------------+", + "+-----+--------------------+", + "| car | sum_speed |", + "+-----+--------------------+", + "| red | 162.49999999999997 |", + "+-----+--------------------+" ], &result ); @@ -135,15 +136,16 @@ async fn query_parquet_demo() -> Result<()> { /// DataFusion can parse a SQL text and convert it back to SQL using [`Unparser`]. 
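Unparsing in isolation, with the same `with_pretty` setting the demo uses:

```rust
use datafusion::error::Result;
use datafusion::logical_expr::{col, lit};
use datafusion::sql::unparser::Unparser;

fn main() -> Result<()> {
    let expr = col("car").eq(lit("red")).or(col("speed").gt(lit(1.0_f64)));
    // `with_pretty(true)` drops parentheses that are implied by precedence
    let unparser = Unparser::default().with_pretty(true);
    let sql = unparser.expr_to_sql(&expr)?.to_string();
    println!("{sql}"); // car = 'red' OR speed > 1.0
    Ok(())
}
```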
async fn round_trip_parse_sql_expr_demo() -> Result<()> { - let sql = "((int_col < 5) OR (double_col = 8))"; + let sql = "((car = 'red') OR (speed > 1.0))"; let ctx = SessionContext::new(); - let testdata = datafusion::test_util::parquet_test_data(); + + // Convert the CSV input into a temporary Parquet directory for querying + let dataset = ExampleDataset::Cars; + let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?; + let df = ctx - .read_parquet( - &format!("{testdata}/alltypes_plain.parquet"), - ParquetReadOptions::default(), - ) + .read_parquet(parquet_temp.path_str()?, ParquetReadOptions::default()) .await?; let parsed_expr = df.parse_sql_expr(sql)?; @@ -158,7 +160,7 @@ async fn round_trip_parse_sql_expr_demo() -> Result<()> { // difference in precedence rules between DataFusion and target engines. let unparser = Unparser::default().with_pretty(true); - let pretty = "int_col < 5 OR double_col = 8"; + let pretty = "car = 'red' OR speed > 1.0"; let pretty_round_trip_sql = unparser.expr_to_sql(&parsed_expr)?.to_string(); assert_eq!(pretty, pretty_round_trip_sql); diff --git a/datafusion-examples/examples/plan_to_sql.rs b/datafusion-examples/examples/query_planning/plan_to_sql.rs similarity index 77% rename from datafusion-examples/examples/plan_to_sql.rs rename to datafusion-examples/examples/query_planning/plan_to_sql.rs index 54483b143a169..86aebbc0b2c33 100644 --- a/datafusion-examples/examples/plan_to_sql.rs +++ b/datafusion-examples/examples/query_planning/plan_to_sql.rs @@ -15,7 +15,13 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. + +use std::fmt; +use std::sync::Arc; + use datafusion::common::DFSchemaRef; +use datafusion::common::ScalarValue; use datafusion::error::Result; use datafusion::logical_expr::sqlparser::ast::Statement; use datafusion::logical_expr::{ @@ -32,9 +38,8 @@ use datafusion::sql::unparser::extension_unparser::UserDefinedLogicalNodeUnparse use datafusion::sql::unparser::extension_unparser::{ UnparseToStatementResult, UnparseWithinStatementResult, }; -use datafusion::sql::unparser::{plan_to_sql, Unparser}; -use std::fmt; -use std::sync::Arc; +use datafusion::sql::unparser::{Unparser, plan_to_sql}; +use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet}; /// This example demonstrates the programmatic construction of SQL strings using /// the DataFusion Expr [`Expr`] and LogicalPlan [`LogicalPlan`] API. @@ -43,28 +48,26 @@ use std::sync::Arc; /// The code in this example shows how to: /// /// 1. [`simple_expr_to_sql_demo`]: Create a simple expression [`Exprs`] with -/// fluent API and convert to sql suitable for passing to another database +/// fluent API and convert to sql suitable for passing to another database /// /// 2. [`simple_expr_to_pretty_sql_demo`] Create a simple expression -/// [`Exprs`] with fluent API and convert to sql without extra parentheses, -/// suitable for displaying to humans +/// [`Exprs`] with fluent API and convert to sql without extra parentheses, +/// suitable for displaying to humans /// /// 3. [`simple_expr_to_sql_demo_escape_mysql_style`]" Create a simple -/// expression [`Exprs`] with fluent API and convert to sql escaping column -/// names in MySQL style. +/// expression [`Exprs`] with fluent API and convert to sql escaping column +/// names in MySQL style. /// /// 4. [`simple_plan_to_sql_demo`]: Create a simple logical plan using the -/// DataFrames API and convert to sql string. 
+/// DataFrames API and convert to sql string. /// /// 5. [`round_trip_plan_to_sql_demo`]: Create a logical plan from a SQL string, modify it using the -/// DataFrames API and convert it back to a sql string. +/// DataFrames API and convert it back to a sql string. /// /// 6. [`unparse_my_logical_plan_as_statement`]: Create a custom logical plan and unparse it as a statement. /// /// 7. [`unparse_my_logical_plan_as_subquery`]: Create a custom logical plan and unparse it as a subquery. - -#[tokio::main] -async fn main() -> Result<()> { +pub async fn plan_to_sql_examples() -> Result<()> { // See how to evaluate expressions simple_expr_to_sql_demo()?; simple_expr_to_pretty_sql_demo()?; @@ -114,21 +117,21 @@ fn simple_expr_to_sql_demo_escape_mysql_style() -> Result<()> { async fn simple_plan_to_sql_demo() -> Result<()> { let ctx = SessionContext::new(); - let testdata = datafusion::test_util::parquet_test_data(); + // Convert the CSV input into a temporary Parquet directory for querying + let dataset = ExampleDataset::Cars; + let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?; + let df = ctx - .read_parquet( - &format!("{testdata}/alltypes_plain.parquet"), - ParquetReadOptions::default(), - ) + .read_parquet(parquet_temp.path_str()?, ParquetReadOptions::default()) .await? - .select_columns(&["id", "int_col", "double_col", "date_string_col"])?; + .select_columns(&["car", "speed", "time"])?; // Convert the data frame to a SQL string let sql = plan_to_sql(df.logical_plan())?.to_string(); assert_eq!( sql, - r#"SELECT "?table?".id, "?table?".int_col, "?table?".double_col, "?table?".date_string_col FROM "?table?""# + r#"SELECT "?table?".car, "?table?".speed, "?table?"."time" FROM "?table?""# ); Ok(()) @@ -139,35 +142,35 @@ async fn simple_plan_to_sql_demo() -> Result<()> { async fn round_trip_plan_to_sql_demo() -> Result<()> { let ctx = SessionContext::new(); - let testdata = datafusion::test_util::parquet_test_data(); + // Convert the CSV input into a temporary Parquet directory for querying + let dataset = ExampleDataset::Cars; + let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?; // register parquet file with the execution context ctx.register_parquet( - "alltypes_plain", - &format!("{testdata}/alltypes_plain.parquet"), + "cars", + parquet_temp.path_str()?, ParquetReadOptions::default(), ) .await?; // create a logical plan from a SQL string and then programmatically add new filters + // select car, speed, time from cars where speed > 1 and car = 'red' let df = ctx // Use SQL to read some data from the parquet file - .sql( - "SELECT int_col, double_col, CAST(date_string_col as VARCHAR) \ - FROM alltypes_plain", - ) + .sql("SELECT car, speed, time FROM cars") .await? 
- // Add id > 1 and tinyint_col < double_col filter + // Add speed > 1 and car = 'red' filter .filter( - col("id") + col("speed") .gt(lit(1)) - .and(col("tinyint_col").lt(col("double_col"))), + .and(col("car").eq(lit(ScalarValue::Utf8(Some("red".to_string()))))), )?; let sql = plan_to_sql(df.logical_plan())?.to_string(); assert_eq!( sql, - r#"SELECT alltypes_plain.int_col, alltypes_plain.double_col, CAST(alltypes_plain.date_string_col AS VARCHAR) FROM alltypes_plain WHERE ((alltypes_plain.id > 1) AND (alltypes_plain.tinyint_col < alltypes_plain.double_col))"# + r#"SELECT cars.car, cars.speed, cars."time" FROM cars WHERE ((cars.speed > 1) AND (cars.car = 'red'))"# ); Ok(()) @@ -211,6 +214,7 @@ impl UserDefinedLogicalNodeCore for MyLogicalPlan { } struct PlanToStatement {} + impl UserDefinedLogicalNodeUnparser for PlanToStatement { fn unparse_to_statement( &self, @@ -231,14 +235,15 @@ impl UserDefinedLogicalNodeUnparser for PlanToStatement { /// It can be unparse as a statement that reads from the same parquet file. async fn unparse_my_logical_plan_as_statement() -> Result<()> { let ctx = SessionContext::new(); - let testdata = datafusion::test_util::parquet_test_data(); + + // Convert the CSV input into a temporary Parquet directory for querying + let dataset = ExampleDataset::Cars; + let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?; + let inner_plan = ctx - .read_parquet( - &format!("{testdata}/alltypes_plain.parquet"), - ParquetReadOptions::default(), - ) + .read_parquet(parquet_temp.path_str()?, ParquetReadOptions::default()) .await? - .select_columns(&["id", "int_col", "double_col", "date_string_col"])? + .select_columns(&["car", "speed", "time"])? .into_unoptimized_plan(); let node = Arc::new(MyLogicalPlan { input: inner_plan }); @@ -249,7 +254,7 @@ async fn unparse_my_logical_plan_as_statement() -> Result<()> { let sql = unparser.plan_to_sql(&my_plan)?.to_string(); assert_eq!( sql, - r#"SELECT "?table?".id, "?table?".int_col, "?table?".double_col, "?table?".date_string_col FROM "?table?""# + r#"SELECT "?table?".car, "?table?".speed, "?table?"."time" FROM "?table?""# ); Ok(()) } @@ -284,14 +289,15 @@ impl UserDefinedLogicalNodeUnparser for PlanToSubquery { /// It can be unparse as a subquery that reads from the same parquet file, with some columns projected. async fn unparse_my_logical_plan_as_subquery() -> Result<()> { let ctx = SessionContext::new(); - let testdata = datafusion::test_util::parquet_test_data(); + + // Convert the CSV input into a temporary Parquet directory for querying + let dataset = ExampleDataset::Cars; + let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?; + let inner_plan = ctx - .read_parquet( - &format!("{testdata}/alltypes_plain.parquet"), - ParquetReadOptions::default(), - ) + .read_parquet(parquet_temp.path_str()?, ParquetReadOptions::default()) .await? - .select_columns(&["id", "int_col", "double_col", "date_string_col"])? + .select_columns(&["car", "speed", "time"])? .into_unoptimized_plan(); let node = Arc::new(MyLogicalPlan { input: inner_plan }); @@ -299,8 +305,8 @@ async fn unparse_my_logical_plan_as_subquery() -> Result<()> { let my_plan = LogicalPlan::Extension(Extension { node }); let plan = LogicalPlanBuilder::from(my_plan) .project(vec![ - col("id").alias("my_id"), - col("int_col").alias("my_int"), + col("car").alias("my_car"), + col("speed").alias("my_speed"), ])? 
.build()?; let unparser = @@ -308,8 +314,8 @@ async fn unparse_my_logical_plan_as_subquery() -> Result<()> { let sql = unparser.plan_to_sql(&plan)?.to_string(); assert_eq!( sql, - "SELECT \"?table?\".id AS my_id, \"?table?\".int_col AS my_int FROM \ - (SELECT \"?table?\".id, \"?table?\".int_col, \"?table?\".double_col, \"?table?\".date_string_col FROM \"?table?\")", + "SELECT \"?table?\".car AS my_car, \"?table?\".speed AS my_speed FROM \ + (SELECT \"?table?\".car, \"?table?\".speed, \"?table?\".\"time\" FROM \"?table?\")", ); Ok(()) } diff --git a/datafusion-examples/examples/planner_api.rs b/datafusion-examples/examples/query_planning/planner_api.rs similarity index 86% rename from datafusion-examples/examples/planner_api.rs rename to datafusion-examples/examples/query_planning/planner_api.rs index 55aec7b0108a4..8b2c09f4aecba 100644 --- a/datafusion-examples/examples/planner_api.rs +++ b/datafusion-examples/examples/query_planning/planner_api.rs @@ -15,11 +15,14 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. + use datafusion::error::Result; use datafusion::logical_expr::LogicalPlan; use datafusion::physical_plan::displayable; use datafusion::physical_planner::DefaultPhysicalPlanner; use datafusion::prelude::*; +use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet}; /// This example demonstrates the process of converting logical plan /// into physical execution plans using DataFusion. @@ -32,29 +35,26 @@ use datafusion::prelude::*; /// physical plan: /// - Via the combined `create_physical_plan` API. /// - Utilizing the analyzer, optimizer, and query planner APIs separately. -#[tokio::main] -async fn main() -> Result<()> { +pub async fn planner_api() -> Result<()> { // Set up a DataFusion context and load a Parquet file let ctx = SessionContext::new(); - let testdata = datafusion::test_util::parquet_test_data(); + + // Convert the CSV input into a temporary Parquet directory for querying + let dataset = ExampleDataset::Cars; + let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?; + let df = ctx - .read_parquet( - &format!("{testdata}/alltypes_plain.parquet"), - ParquetReadOptions::default(), - ) + .read_parquet(parquet_temp.path_str()?, ParquetReadOptions::default()) .await?; // Construct the input logical plan using DataFrame API let df = df .clone() - .select(vec![ - df.parse_sql_expr("int_col")?, - df.parse_sql_expr("double_col")?, - ])? - .filter(df.parse_sql_expr("int_col < 5 OR double_col = 8.0")?)? + .select(vec![df.parse_sql_expr("car")?, df.parse_sql_expr("speed")?])? + .filter(df.parse_sql_expr("car = 'red' OR speed > 1.0")?)? .aggregate( - vec![df.parse_sql_expr("double_col")?], - vec![df.parse_sql_expr("SUM(int_col) as sum_int_col")?], + vec![df.parse_sql_expr("car")?], + vec![df.parse_sql_expr("SUM(speed) as sum_speed")?], )? .limit(0, Some(1))?; let logical_plan = df.logical_plan().clone(); diff --git a/datafusion-examples/examples/pruning.rs b/datafusion-examples/examples/query_planning/pruning.rs similarity index 96% rename from datafusion-examples/examples/pruning.rs rename to datafusion-examples/examples/query_planning/pruning.rs index 9a61789662cdd..7fdc4a7952d68 100644 --- a/datafusion-examples/examples/pruning.rs +++ b/datafusion-examples/examples/query_planning/pruning.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. 
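+//! +//! The core flow, as a sketch of the code below: +//! ```text +//! expr: x = 5 AND y = 10 +//! predicate = create_pruning_predicate(expr, &schema); +//! predicate.prune(&catalog)? -> Vec<bool> // false => file can be skipped +//! ```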
+ use std::collections::HashSet; use std::sync::Arc; @@ -22,6 +24,7 @@ use arrow::array::{ArrayRef, BooleanArray, Int32Array}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion::common::pruning::PruningStatistics; use datafusion::common::{DFSchema, ScalarValue}; +use datafusion::error::Result; use datafusion::execution::context::ExecutionProps; use datafusion::physical_expr::create_physical_expr; use datafusion::physical_optimizer::pruning::PruningPredicate; @@ -40,8 +43,7 @@ use datafusion::prelude::*; /// one might do as part of a higher level storage engine. See /// `parquet_index.rs` for an example that uses pruning in the context of an /// individual query. -#[tokio::main] -async fn main() { +pub async fn pruning() -> Result<()> { // In this example, we'll use the PruningPredicate to determine if // the expression `x = 5 AND y = 10` can never be true based on statistics @@ -69,7 +71,7 @@ async fn main() { let predicate = create_pruning_predicate(expr, &my_catalog.schema); // Evaluate the predicate for the three files in the catalog - let prune_results = predicate.prune(&my_catalog).unwrap(); + let prune_results = predicate.prune(&my_catalog)?; println!("Pruning results: {prune_results:?}"); // The result is a `Vec` of bool values, one for each file in the catalog @@ -93,6 +95,8 @@ async fn main() { false ] ); + + Ok(()) } /// A simple model catalog that has information about the three files that store @@ -170,7 +174,7 @@ impl PruningStatistics for MyCatalog { None } - fn row_counts(&self, _column: &Column) -> Option<ArrayRef> { + fn row_counts(&self) -> Option<ArrayRef> { // In this example, we know nothing about the number of rows in each file None } @@ -186,6 +190,7 @@ impl PruningStatistics for MyCatalog { } } +#[expect(clippy::needless_pass_by_value)] fn create_pruning_predicate(expr: Expr, schema: &SchemaRef) -> PruningPredicate { let df_schema = DFSchema::try_from(Arc::clone(schema)).unwrap(); let props = ExecutionProps::new(); diff --git a/datafusion-examples/examples/thread_pools.rs b/datafusion-examples/examples/query_planning/thread_pools.rs similarity index 96% rename from datafusion-examples/examples/thread_pools.rs rename to datafusion-examples/examples/query_planning/thread_pools.rs index bba56b2932abc..2ff73a77c4024 100644 --- a/datafusion-examples/examples/thread_pools.rs +++ b/datafusion-examples/examples/query_planning/thread_pools.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. +//! //! This example shows how to use separate thread pools (tokio [`Runtime`])s to //! run the IO and CPU intensive parts of DataFusion plans. //! @@ -35,15 +37,17 @@ //! //! [Architecture section]: https://docs.rs/datafusion/latest/datafusion/index.html#thread-scheduling-cpu--io-thread-pools-and-tokio-runtimes +use std::sync::Arc; + use arrow::util::pretty::pretty_format_batches; use datafusion::common::runtime::JoinSet; use datafusion::error::Result; use datafusion::execution::SendableRecordBatchStream; use datafusion::prelude::*; +use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet}; use futures::stream::StreamExt; use object_store::client::SpawnedReqwestConnector; use object_store::http::HttpBuilder; -use std::sync::Arc; use tokio::runtime::Handle; use tokio::sync::Notify; use url::Url; @@ -64,15 +68,16 @@ use url::Url; /// when using Rust libraries such as `tonic`.
Using a separate `Runtime` for /// CPU bound tasks will often be simpler in larger applications, even though it /// makes this example slightly more complex. -#[tokio::main] -async fn main() -> Result<()> { +pub async fn thread_pools() -> Result<()> { // The first two examples read local files. Enabling the URL table feature // lets us treat filenames as tables in SQL. let ctx = SessionContext::new().enable_url_table(); - let sql = format!( - "SELECT * FROM '{}/alltypes_plain.parquet'", - datafusion::test_util::parquet_test_data() - ); + + // Convert the CSV input into a temporary Parquet directory for querying + let dataset = ExampleDataset::Cars; + let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?; + + let sql = format!("SELECT * FROM '{}'", parquet_temp.path_str()?); // Run a query on the current runtime. Calling `await` means the future // (in this case the `async` function and all spawned work in DataFusion @@ -121,7 +126,7 @@ async fn same_runtime(ctx: &SessionContext, sql: &str) -> Result<()> { // Executing the plan using this pattern intermixes any IO and CPU intensive // work on same Runtime while let Some(batch) = stream.next().await { - println!("{}", pretty_format_batches(&[batch?]).unwrap()); + println!("{}", pretty_format_batches(&[batch?])?); } Ok(()) } @@ -342,7 +347,7 @@ impl CpuRuntime { /// message such as: /// /// ```text - ///A Tokio 1.x context was found, but IO is disabled. + /// A Tokio 1.x context was found, but IO is disabled. /// ``` pub fn handle(&self) -> &Handle { &self.handle diff --git a/datafusion-examples/examples/relation_planner/main.rs b/datafusion-examples/examples/relation_planner/main.rs new file mode 100644 index 0000000000000..babc0d3714f72 --- /dev/null +++ b/datafusion-examples/examples/relation_planner/main.rs @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! # Relation Planner Examples +//! +//! These examples demonstrate how to use custom relation planners to extend +//! DataFusion's SQL syntax with custom table operators. +//! +//! ## Usage +//! ```bash +//! cargo run --example relation_planner -- [all|match_recognize|pivot_unpivot|table_sample] +//! ``` +//! +//! Each subcommand runs a corresponding example: +//! - `all` — run all examples included in this module +//! +//! - `match_recognize` +//! (file: match_recognize.rs, desc: Implement MATCH_RECOGNIZE pattern matching) +//! +//! - `pivot_unpivot` +//! (file: pivot_unpivot.rs, desc: Implement PIVOT / UNPIVOT) +//! +//! - `table_sample` +//! (file: table_sample.rs, desc: Implement TABLESAMPLE) +//! +//! ## Snapshot Testing +//! +//! These examples use [insta](https://insta.rs) for inline snapshot assertions. +//! If query output changes, regenerate the snapshots with: +//! ```bash +//! 
cargo insta test --example relation_planner --accept +//! ``` + +mod match_recognize; +mod pivot_unpivot; +mod table_sample; + +use datafusion::error::{DataFusionError, Result}; +use strum::{IntoEnumIterator, VariantNames}; +use strum_macros::{Display, EnumIter, EnumString, VariantNames}; + +#[derive(EnumIter, EnumString, Display, VariantNames)] +#[strum(serialize_all = "snake_case")] +enum ExampleKind { + All, + MatchRecognize, + PivotUnpivot, + TableSample, +} + +impl ExampleKind { + const EXAMPLE_NAME: &str = "relation_planner"; + + fn runnable() -> impl Iterator<Item = Self> { + ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All)) + } + + async fn run(&self) -> Result<()> { + match self { + ExampleKind::All => { + for example in ExampleKind::runnable() { + println!("Running example: {example}"); + Box::pin(example.run()).await?; + } + } + ExampleKind::MatchRecognize => match_recognize::match_recognize().await?, + ExampleKind::PivotUnpivot => pivot_unpivot::pivot_unpivot().await?, + ExampleKind::TableSample => table_sample::table_sample().await?, + } + + Ok(()) + } +} + +#[tokio::main] +async fn main() -> Result<()> { + let usage = format!( + "Usage: cargo run --example {} -- [{}]", + ExampleKind::EXAMPLE_NAME, + ExampleKind::VARIANTS.join("|") + ); + + let example: ExampleKind = std::env::args() + .nth(1) + .unwrap_or_else(|| ExampleKind::All.to_string()) + .parse() + .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?; + + example.run().await } + +/// Test wrappers that enable `cargo insta test --example relation_planner --accept` +/// to regenerate inline snapshots. Without these, insta cannot run the examples +/// in test mode since they only have `main()` functions. +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_match_recognize() { + match_recognize::match_recognize().await.unwrap(); + } + + #[tokio::test] + async fn test_pivot_unpivot() { + pivot_unpivot::pivot_unpivot().await.unwrap(); + } + + #[tokio::test] + async fn test_table_sample() { + table_sample::table_sample().await.unwrap(); + } +} diff --git a/datafusion-examples/examples/relation_planner/match_recognize.rs b/datafusion-examples/examples/relation_planner/match_recognize.rs new file mode 100644 index 0000000000000..c4b3d522efc17 --- /dev/null +++ b/datafusion-examples/examples/relation_planner/match_recognize.rs @@ -0,0 +1,408 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! # MATCH_RECOGNIZE Example +//! +//! This example demonstrates implementing SQL `MATCH_RECOGNIZE` pattern matching +//! using a custom [`RelationPlanner`]. Unlike the [`pivot_unpivot`] example that +//! rewrites SQL to standard operations, this example creates a **custom logical +//!
plan node** (`MiniMatchRecognizeNode`) to represent the operation. +//! +//! ## Supported Syntax +//! +//! ```sql +//! SELECT * FROM events +//! MATCH_RECOGNIZE ( +//! PARTITION BY region +//! MEASURES SUM(price) AS total, AVG(price) AS average +//! PATTERN (A B+ C) +//! DEFINE +//! A AS price < 100, +//! B AS price BETWEEN 100 AND 200, +//! C AS price > 200 +//! ) AS matches +//! ``` +//! +//! ## Architecture +//! +//! This example demonstrates **logical planning only**. Physical execution would +//! require implementing an [`ExecutionPlan`] (see the [`table_sample`] example +//! for a complete implementation with physical planning). +//! +//! ```text +//! SQL Query +//! │ +//! ▼ +//! ┌─────────────────────────────────────┐ +//! │ MatchRecognizePlanner │ +//! │ (RelationPlanner trait) │ +//! │ │ +//! │ • Parses MATCH_RECOGNIZE syntax │ +//! │ • Creates MiniMatchRecognizeNode │ +//! │ • Converts SQL exprs to DataFusion │ +//! └─────────────────────────────────────┘ +//! │ +//! ▼ +//! ┌─────────────────────────────────────┐ +//! │ MiniMatchRecognizeNode │ +//! │ (UserDefinedLogicalNode) │ +//! │ │ +//! │ • measures: [(alias, expr), ...] │ +//! │ • definitions: [(symbol, expr), ...]│ +//! └─────────────────────────────────────┘ +//! ``` +//! +//! [`pivot_unpivot`]: super::pivot_unpivot +//! [`table_sample`]: super::table_sample +//! [`ExecutionPlan`]: datafusion::physical_plan::ExecutionPlan + +use std::{any::Any, cmp::Ordering, hash::Hasher, sync::Arc}; + +use arrow::array::{ArrayRef, Float64Array, Int32Array, StringArray}; +use arrow::record_batch::RecordBatch; +use datafusion::prelude::*; +use datafusion_common::{DFSchemaRef, Result}; +use datafusion_expr::{ + Expr, UserDefinedLogicalNode, + logical_plan::{Extension, InvariantLevel, LogicalPlan}, + planner::{ + PlannedRelation, RelationPlanner, RelationPlannerContext, RelationPlanning, + }, +}; +use datafusion_sql::sqlparser::ast::TableFactor; +use insta::assert_snapshot; + +// ============================================================================ +// Example Entry Point +// ============================================================================ + +/// Runs the MATCH_RECOGNIZE examples demonstrating pattern matching on event streams. +/// +/// Note: This example demonstrates **logical planning only**. Physical execution +/// would require additional implementation of an [`ExecutionPlan`]. 
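+/// +/// To run just this example (via the dispatcher in `main.rs`): +/// ```bash +/// cargo run --example relation_planner -- match_recognize +/// ```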
+pub async fn match_recognize() -> Result<()> { + let ctx = SessionContext::new(); + ctx.register_relation_planner(Arc::new(MatchRecognizePlanner))?; + register_sample_data(&ctx)?; + + println!("MATCH_RECOGNIZE Example (Logical Planning Only)"); + println!("================================================\n"); + + run_examples(&ctx).await +} + +async fn run_examples(ctx: &SessionContext) -> Result<()> { + // Example 1: Basic MATCH_RECOGNIZE with MEASURES and DEFINE + // Demonstrates: Aggregate measures over matched rows + let plan = run_example( + ctx, + "Example 1: MATCH_RECOGNIZE with aggregations", + r#"SELECT * FROM events + MATCH_RECOGNIZE ( + PARTITION BY 1 + MEASURES SUM(price) AS total_price, AVG(price) AS avg_price + PATTERN (A) + DEFINE A AS price > 10 + ) AS matches"#, + ) + .await?; + assert_snapshot!(plan, @r" + Projection: matches.price + SubqueryAlias: matches + MiniMatchRecognize measures=[total_price := sum(events.price), avg_price := avg(events.price)] define=[a := events.price > Int64(10)] + TableScan: events + "); + + // Example 2: Stock price pattern detection + // Demonstrates: Real-world use case finding prices above threshold + let plan = run_example( + ctx, + "Example 2: Detect high stock prices", + r#"SELECT * FROM stock_prices + MATCH_RECOGNIZE ( + MEASURES + MIN(price) AS min_price, + MAX(price) AS max_price, + AVG(price) AS avg_price + PATTERN (HIGH) + DEFINE HIGH AS price > 151.0 + ) AS trends"#, + ) + .await?; + assert_snapshot!(plan, @r" + Projection: trends.symbol, trends.price + SubqueryAlias: trends + MiniMatchRecognize measures=[min_price := min(stock_prices.price), max_price := max(stock_prices.price), avg_price := avg(stock_prices.price)] define=[high := stock_prices.price > Float64(151)] + TableScan: stock_prices + "); + + Ok(()) +} + +/// Helper to run a single example query and display the logical plan. +async fn run_example(ctx: &SessionContext, title: &str, sql: &str) -> Result<String> { + println!("{title}:\n{sql}\n"); + let plan = ctx.sql(sql).await?.into_unoptimized_plan(); + let plan_str = plan.display_indent().to_string(); + println!("{plan_str}\n"); + Ok(plan_str) +} + +/// Register test data tables. +fn register_sample_data(ctx: &SessionContext) -> Result<()> { + // events: simple price series + ctx.register_batch( + "events", + RecordBatch::try_from_iter(vec![( + "price", + Arc::new(Int32Array::from(vec![5, 12, 8, 15, 20])) as ArrayRef, + )])?, + )?; + + // stock_prices: realistic stock data + ctx.register_batch( + "stock_prices", + RecordBatch::try_from_iter(vec![ + ( + "symbol", + Arc::new(StringArray::from(vec!["DDOG", "DDOG", "DDOG", "DDOG"])) + as ArrayRef, + ), + ( + "price", + Arc::new(Float64Array::from(vec![150.0, 155.0, 152.0, 158.0])), + ), + ])?, + )?; + + Ok(()) +} + +// ============================================================================ +// Logical Plan Node: MiniMatchRecognizeNode +// ============================================================================ + +/// A custom logical plan node representing MATCH_RECOGNIZE operations.
+/// +/// This is a simplified implementation that captures the essential structure: +/// - `measures`: Aggregate expressions computed over matched rows +/// - `definitions`: Symbol definitions (predicate expressions) +/// +/// A production implementation would also include: +/// - Pattern specification (regex-like pattern) +/// - Partition and order by clauses +/// - Output mode (ONE ROW PER MATCH, ALL ROWS PER MATCH) +/// - After match skip strategy +#[derive(Debug)] +struct MiniMatchRecognizeNode { + input: Arc<LogicalPlan>, + schema: DFSchemaRef, + /// Measures: (alias, aggregate_expr) + measures: Vec<(String, Expr)>, + /// Symbol definitions: (symbol_name, predicate_expr) + definitions: Vec<(String, Expr)>, +} + +impl UserDefinedLogicalNode for MiniMatchRecognizeNode { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "MiniMatchRecognize" + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.input] + } + + fn schema(&self) -> &DFSchemaRef { + &self.schema + } + + fn check_invariants(&self, _check: InvariantLevel) -> Result<()> { + Ok(()) + } + + fn expressions(&self) -> Vec<Expr> { + self.measures + .iter() + .chain(&self.definitions) + .map(|(_, expr)| expr.clone()) + .collect() + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "MiniMatchRecognize")?; + + if !self.measures.is_empty() { + write!(f, " measures=[")?; + for (i, (alias, expr)) in self.measures.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{alias} := {expr}")?; + } + write!(f, "]")?; + } + + if !self.definitions.is_empty() { + write!(f, " define=[")?; + for (i, (symbol, expr)) in self.definitions.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{symbol} := {expr}")?; + } + write!(f, "]")?; + } + + Ok(()) + } + + fn with_exprs_and_inputs( + &self, + exprs: Vec<Expr>, + inputs: Vec<LogicalPlan>, + ) -> Result<Arc<dyn UserDefinedLogicalNode>> { + let expected_len = self.measures.len() + self.definitions.len(); + if exprs.len() != expected_len { + return Err(datafusion_common::plan_datafusion_err!( + "MiniMatchRecognize: expected {expected_len} expressions, got {}", + exprs.len() + )); + } + + let input = inputs.into_iter().next().ok_or_else(|| { + datafusion_common::plan_datafusion_err!( + "MiniMatchRecognize requires exactly one input" + ) + })?; + + let (measure_exprs, definition_exprs) = exprs.split_at(self.measures.len()); + + let measures = self + .measures + .iter() + .zip(measure_exprs) + .map(|((alias, _), expr)| (alias.clone(), expr.clone())) + .collect(); + + let definitions = self + .definitions + .iter() + .zip(definition_exprs) + .map(|((symbol, _), expr)| (symbol.clone(), expr.clone())) + .collect(); + + Ok(Arc::new(Self { + input: Arc::new(input), + schema: Arc::clone(&self.schema), + measures, + definitions, + })) + } + + fn dyn_hash(&self, state: &mut dyn Hasher) { + state.write_usize(Arc::as_ptr(&self.input) as usize); + state.write_usize(self.measures.len()); + state.write_usize(self.definitions.len()); + } + + fn dyn_eq(&self, other: &dyn UserDefinedLogicalNode) -> bool { + other.as_any().downcast_ref::<Self>().is_some_and(|o| { + Arc::ptr_eq(&self.input, &o.input) + && self.measures == o.measures + && self.definitions == o.definitions + }) + } + + fn dyn_ord(&self, other: &dyn UserDefinedLogicalNode) -> Option<Ordering> { + if self.dyn_eq(other) { + Some(Ordering::Equal) + } else { + None + } + } +} + +// ============================================================================ +// Relation Planner: MatchRecognizePlanner +//
============================================================================ + +/// Relation planner that creates `MiniMatchRecognizeNode` for MATCH_RECOGNIZE queries. +#[derive(Debug)] +struct MatchRecognizePlanner; + +impl RelationPlanner for MatchRecognizePlanner { + fn plan_relation( + &self, + relation: TableFactor, + ctx: &mut dyn RelationPlannerContext, + ) -> Result<RelationPlanning> { + let TableFactor::MatchRecognize { + table, + measures, + symbols, + alias, + .. + } = relation + else { + return Ok(RelationPlanning::Original(Box::new(relation))); + }; + + // Plan the input table + let input = ctx.plan(*table)?; + let schema = input.schema().clone(); + + // Convert MEASURES: SQL expressions → DataFusion expressions + let planned_measures: Vec<(String, Expr)> = measures + .iter() + .map(|m| { + let alias = ctx.normalize_ident(m.alias.clone()); + let expr = ctx.sql_to_expr(m.expr.clone(), schema.as_ref())?; + Ok((alias, expr)) + }) + .collect::<Result<_>>()?; + + // Convert DEFINE: symbol definitions → DataFusion expressions + let planned_definitions: Vec<(String, Expr)> = symbols + .iter() + .map(|s| { + let name = ctx.normalize_ident(s.symbol.clone()); + let expr = ctx.sql_to_expr(s.definition.clone(), schema.as_ref())?; + Ok((name, expr)) + }) + .collect::<Result<_>>()?; + + // Create the custom node + let node = MiniMatchRecognizeNode { + input: Arc::new(input), + schema, + measures: planned_measures, + definitions: planned_definitions, + }; + + let plan = LogicalPlan::Extension(Extension { + node: Arc::new(node), + }); + + Ok(RelationPlanning::Planned(Box::new(PlannedRelation::new( + plan, alias, + )))) + } +} diff --git a/datafusion-examples/examples/relation_planner/pivot_unpivot.rs b/datafusion-examples/examples/relation_planner/pivot_unpivot.rs new file mode 100644 index 0000000000000..4b721346aa72d --- /dev/null +++ b/datafusion-examples/examples/relation_planner/pivot_unpivot.rs @@ -0,0 +1,619 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! # PIVOT and UNPIVOT Example +//! +//! This example demonstrates implementing SQL `PIVOT` and `UNPIVOT` operations +//! using a custom [`RelationPlanner`]. Unlike the other examples that create +//! custom logical/physical nodes, this example shows how to **rewrite** SQL +//! constructs into equivalent standard SQL operations: +//! +//! ## Supported Syntax +//! +//! ```sql +//! -- PIVOT: Transform rows into columns +//! SELECT * FROM sales +//! PIVOT (SUM(amount) FOR quarter IN ('Q1', 'Q2', 'Q3', 'Q4')) +//! +//! -- UNPIVOT: Transform columns into rows +//! SELECT * FROM wide_table +//! UNPIVOT (value FOR name IN (col1, col2, col3)) +//! ``` +//! +//! ## Rewrite Strategy +//! +//! **PIVOT** is rewritten to `GROUP BY` with `CASE` expressions: +//! ```sql +//! -- Original: +//!
SELECT * FROM sales PIVOT (SUM(amount) FOR quarter IN ('Q1', 'Q2')) +//! +//! -- Rewritten to: +//! SELECT region, +//! SUM(CASE quarter WHEN 'Q1' THEN amount END) AS Q1, +//! SUM(CASE quarter WHEN 'Q2' THEN amount END) AS Q2 +//! FROM sales +//! GROUP BY region +//! ``` +//! +//! **UNPIVOT** is rewritten to `UNION ALL` of projections: +//! ```sql +//! -- Original: +//! SELECT * FROM wide UNPIVOT (sales FOR quarter IN (q1, q2)) +//! +//! -- Rewritten to: +//! SELECT region, 'q1' AS quarter, q1 AS sales FROM wide +//! UNION ALL +//! SELECT region, 'q2' AS quarter, q2 AS sales FROM wide +//! ``` + +use std::sync::Arc; + +use arrow::array::{ArrayRef, Int64Array, StringArray}; +use arrow::record_batch::RecordBatch; +use datafusion::prelude::*; +use datafusion_common::{Result, ScalarValue, plan_datafusion_err}; +use datafusion_expr::{ + Expr, case, col, lit, + logical_plan::builder::LogicalPlanBuilder, + planner::{ + PlannedRelation, RelationPlanner, RelationPlannerContext, RelationPlanning, + }, +}; +use datafusion_sql::sqlparser::ast::{NullInclusion, PivotValueSource, TableFactor}; +use insta::assert_snapshot; + +// ============================================================================ +// Example Entry Point +// ============================================================================ + +/// Runs the PIVOT/UNPIVOT examples demonstrating data reshaping operations. +pub async fn pivot_unpivot() -> Result<()> { + let ctx = SessionContext::new(); + ctx.register_relation_planner(Arc::new(PivotUnpivotPlanner))?; + register_sample_data(&ctx)?; + + println!("PIVOT and UNPIVOT Example"); + println!("=========================\n"); + + run_examples(&ctx).await +} + +async fn run_examples(ctx: &SessionContext) -> Result<()> { + // ----- PIVOT Examples ----- + + // Example 1: Basic PIVOT + // Transforms: (region, quarter, amount) → (region, Q1, Q2) + let results = run_example( + ctx, + "Example 1: Basic PIVOT", + r#"SELECT * FROM quarterly_sales + PIVOT (SUM(amount) FOR quarter IN ('Q1', 'Q2')) AS p + ORDER BY region"#, + ) + .await?; + assert_snapshot!(results, @r" + +--------+------+------+ + | region | Q1 | Q2 | + +--------+------+------+ + | North | 1000 | 1500 | + | South | 1200 | 1300 | + +--------+------+------+ + "); + + // Example 2: PIVOT with multiple aggregates + // Creates columns for each (aggregate, value) combination + let results = run_example( + ctx, + "Example 2: PIVOT with multiple aggregates", + r#"SELECT * FROM quarterly_sales + PIVOT (SUM(amount), AVG(amount) FOR quarter IN ('Q1', 'Q2')) AS p + ORDER BY region"#, + ) + .await?; + assert_snapshot!(results, @r" + +--------+--------+--------+--------+--------+ + | region | sum_Q1 | sum_Q2 | avg_Q1 | avg_Q2 | + +--------+--------+--------+--------+--------+ + | North | 1000 | 1500 | 1000.0 | 1500.0 | + | South | 1200 | 1300 | 1200.0 | 1300.0 | + +--------+--------+--------+--------+--------+ + "); + + // Example 3: PIVOT with multiple grouping columns + // Non-pivot, non-aggregate columns become GROUP BY columns + let results = run_example( + ctx, + "Example 3: PIVOT with multiple grouping columns", + r#"SELECT * FROM product_sales + PIVOT (SUM(amount) FOR quarter IN ('Q1', 'Q2')) AS p + ORDER BY region, product"#, + ) + .await?; + assert_snapshot!(results, @r" + +--------+----------+-----+-----+ + | region | product | Q1 | Q2 | + +--------+----------+-----+-----+ + | North | ProductA | 500 | | + | North | ProductB | 500 | | + | South | ProductA | | 650 | + +--------+----------+-----+-----+ + "); + + // ----- UNPIVOT Examples 
----- + + // Example 4: Basic UNPIVOT + // Transforms: (region, q1, q2) → (region, quarter, sales) + let results = run_example( + ctx, + "Example 4: Basic UNPIVOT", + r#"SELECT * FROM wide_sales + UNPIVOT (sales FOR quarter IN (q1 AS 'Q1', q2 AS 'Q2')) AS u + ORDER BY quarter, region"#, + ) + .await?; + assert_snapshot!(results, @r" + +--------+---------+-------+ + | region | quarter | sales | + +--------+---------+-------+ + | North | Q1 | 1000 | + | South | Q1 | 1200 | + | North | Q2 | 1500 | + | South | Q2 | 1300 | + +--------+---------+-------+ + "); + + // Example 5: UNPIVOT with INCLUDE NULLS + // By default, UNPIVOT excludes rows where the value column is NULL. + // INCLUDE NULLS keeps them (same result here since no NULLs in data). + let results = run_example( + ctx, + "Example 5: UNPIVOT INCLUDE NULLS", + r#"SELECT * FROM wide_sales + UNPIVOT INCLUDE NULLS (sales FOR quarter IN (q1 AS 'Q1', q2 AS 'Q2')) AS u + ORDER BY quarter, region"#, + ) + .await?; + assert_snapshot!(results, @r" + +--------+---------+-------+ + | region | quarter | sales | + +--------+---------+-------+ + | North | Q1 | 1000 | + | South | Q1 | 1200 | + | North | Q2 | 1500 | + | South | Q2 | 1300 | + +--------+---------+-------+ + "); + + // Example 6: PIVOT with column projection + // Standard SQL operations work seamlessly after PIVOT + let results = run_example( + ctx, + "Example 6: PIVOT with projection", + r#"SELECT region FROM quarterly_sales + PIVOT (SUM(amount) FOR quarter IN ('Q1', 'Q2')) AS p + ORDER BY region"#, + ) + .await?; + assert_snapshot!(results, @r" + +--------+ + | region | + +--------+ + | North | + | South | + +--------+ + "); + + // Example 7: PIVOT on a quoted mixed-case column + // Reuses the parsed column expression so quoted identifiers keep their case. + let results = run_example( + ctx, + "Example 7: PIVOT with quoted mixed-case column", + r#"SELECT * FROM point_stats + PIVOT (MAX(max_value) FOR "pointNumber" IN ('16951' AS p16951, '16952' AS p16952)) AS p + ORDER BY ts"#, + ) + .await?; + assert_snapshot!(results, @r" + +----------------------+------+--------+--------+ + | ts | port | p16951 | p16952 | + +----------------------+------+--------+--------+ + | 2024-09-01T10:00:00Z | 2411 | 10 | 20 | + | 2024-09-01T10:01:00Z | 2411 | 30 | 40 | + +----------------------+------+--------+--------+ + "); + + Ok(()) +} + +/// Helper to run a single example query and capture results. +async fn run_example(ctx: &SessionContext, title: &str, sql: &str) -> Result<String> { + println!("{title}:\n{sql}\n"); + let df = ctx.sql(sql).await?; + println!("{}\n", df.logical_plan().display_indent()); + + let batches = df.collect().await?; + let results = arrow::util::pretty::pretty_format_batches(&batches)?.to_string(); + println!("{results}\n"); + + Ok(results) +} + +/// Register test data tables.
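+/// +/// Input shapes used above (a sketch; the actual batches are below): +/// ```text +/// quarterly_sales(region, quarter, amount) -- normalized input for PIVOT +/// wide_sales(region, q1, q2) -- wide input for UNPIVOT +/// ```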
+fn register_sample_data(ctx: &SessionContext) -> Result<()> { + // quarterly_sales: normalized sales data (region, quarter, amount) + ctx.register_batch( + "quarterly_sales", + RecordBatch::try_from_iter(vec![ + ( + "region", + Arc::new(StringArray::from(vec!["North", "North", "South", "South"])) + as ArrayRef, + ), + ( + "quarter", + Arc::new(StringArray::from(vec!["Q1", "Q2", "Q1", "Q2"])), + ), + ( + "amount", + Arc::new(Int64Array::from(vec![1000, 1500, 1200, 1300])), + ), + ])?, + )?; + + // product_sales: sales with additional grouping dimension + ctx.register_batch( + "product_sales", + RecordBatch::try_from_iter(vec![ + ( + "region", + Arc::new(StringArray::from(vec!["North", "North", "South"])) as ArrayRef, + ), + ( + "quarter", + Arc::new(StringArray::from(vec!["Q1", "Q1", "Q2"])), + ), + ( + "product", + Arc::new(StringArray::from(vec!["ProductA", "ProductB", "ProductA"])), + ), + ("amount", Arc::new(Int64Array::from(vec![500, 500, 650]))), + ])?, + )?; + + // wide_sales: denormalized/wide format (for UNPIVOT) + ctx.register_batch( + "wide_sales", + RecordBatch::try_from_iter(vec![ + ( + "region", + Arc::new(StringArray::from(vec!["North", "South"])) as ArrayRef, + ), + ("q1", Arc::new(Int64Array::from(vec![1000, 1200]))), + ("q2", Arc::new(Int64Array::from(vec![1500, 1300]))), + ])?, + )?; + + // point_stats: grouped data with a quoted mixed-case pivot column. + ctx.register_batch( + "point_stats", + RecordBatch::try_from_iter(vec![ + ( + "ts", + Arc::new(StringArray::from(vec![ + "2024-09-01T10:00:00Z", + "2024-09-01T10:00:00Z", + "2024-09-01T10:01:00Z", + "2024-09-01T10:01:00Z", + ])) as ArrayRef, + ), + ( + "pointNumber", + Arc::new(StringArray::from(vec!["16951", "16952", "16951", "16952"])), + ), + ( + "port", + Arc::new(StringArray::from(vec!["2411", "2411", "2411", "2411"])), + ), + ( + "max_value", + Arc::new(Int64Array::from(vec![10, 20, 30, 40])), + ), + ])?, + )?; + + Ok(()) +} + +// ============================================================================ +// Relation Planner: PivotUnpivotPlanner +// ============================================================================ + +/// Relation planner that rewrites PIVOT and UNPIVOT into standard SQL. +#[derive(Debug)] +struct PivotUnpivotPlanner; + +impl RelationPlanner for PivotUnpivotPlanner { + fn plan_relation( + &self, + relation: TableFactor, + ctx: &mut dyn RelationPlannerContext, + ) -> Result { + match relation { + TableFactor::Pivot { + table, + aggregate_functions, + value_column, + value_source, + alias, + .. + } => plan_pivot( + ctx, + *table, + &aggregate_functions, + &value_column, + value_source, + alias, + ), + + TableFactor::Unpivot { + table, + value, + name, + columns, + null_inclusion, + alias, + } => plan_unpivot( + ctx, + *table, + &value, + name, + &columns, + null_inclusion.as_ref(), + alias, + ), + + other => Ok(RelationPlanning::Original(Box::new(other))), + } + } +} + +// ============================================================================ +// PIVOT Implementation +// ============================================================================ + +/// Rewrite PIVOT to GROUP BY with CASE expressions. 
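+/// +/// For each (aggregate, pivot value) pair this builds, roughly: +/// ```text +/// SUM(CASE quarter WHEN 'Q1' THEN amount END) AS Q1 +/// ``` +/// and the remaining non-pivot, non-aggregate columns become the GROUP BY keys.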
+fn plan_pivot( + ctx: &mut dyn RelationPlannerContext, + table: TableFactor, + aggregate_functions: &[datafusion_sql::sqlparser::ast::ExprWithAlias], + value_column: &[datafusion_sql::sqlparser::ast::Expr], + value_source: PivotValueSource, + alias: Option<datafusion_sql::sqlparser::ast::TableAlias>, +) -> Result<RelationPlanning> { + // Plan the input table + let input = ctx.plan(table)?; + let schema = input.schema(); + + // Parse aggregate functions + let aggregates: Vec<Expr> = aggregate_functions + .iter() + .map(|agg| ctx.sql_to_expr(agg.expr.clone(), schema.as_ref())) + .collect::<Result<_>>()?; + + // Get the pivot column (only single-column pivot supported) + if value_column.len() != 1 { + return Err(plan_datafusion_err!( + "Only single-column PIVOT is supported" + )); + } + let pivot_col = ctx.sql_to_expr(value_column[0].clone(), schema.as_ref())?; + let pivot_col_name = extract_column_name(&pivot_col)?; + + // Parse pivot values + let pivot_values = match value_source { + PivotValueSource::List(list) => list + .iter() + .map(|item| { + let alias = item + .alias + .as_ref() + .map(|id| ctx.normalize_ident(id.clone())); + let expr = ctx.sql_to_expr(item.expr.clone(), schema.as_ref())?; + Ok((alias, expr)) + }) + .collect::<Result<Vec<_>>>()?, + _ => { + return Err(plan_datafusion_err!( + "Dynamic PIVOT (ANY/Subquery) is not supported" + )); + } + }; + + // Determine GROUP BY columns (non-pivot, non-aggregate columns) + let agg_input_cols: Vec<&str> = aggregates + .iter() + .filter_map(|agg| { + if let Expr::AggregateFunction(f) = agg { + f.params.args.first().and_then(|e| { + if let Expr::Column(c) = e { + Some(c.name.as_str()) + } else { + None + } + }) + } else { + None + } + }) + .collect(); + + let group_by_cols: Vec<Expr> = schema + .iter() + .filter(|(_, field)| { + let name = field.name(); + name != pivot_col_name.as_str() && !agg_input_cols.contains(&name.as_str()) + }) + .map(Expr::from) + .collect(); + + // Build CASE expressions for each (aggregate, pivot_value) pair + let mut pivot_exprs = Vec::new(); + for agg in &aggregates { + let Expr::AggregateFunction(agg_fn) = agg else { + continue; + }; + let Some(agg_input) = agg_fn.params.args.first().cloned() else { + continue; + }; + + for (value_alias, pivot_value) in &pivot_values { + // CASE pivot_col WHEN pivot_value THEN agg_input END + let case_expr = case(pivot_col.clone()) + .when(pivot_value.clone(), agg_input.clone()) + .end()?; + + // Wrap in aggregate function + let pivoted = agg_fn.func.call(vec![case_expr]); + + // Determine column alias + let value_str = value_alias + .clone() + .unwrap_or_else(|| expr_to_string(pivot_value)); + let col_alias = if aggregates.len() > 1 { + format!("{}_{}", agg_fn.func.name(), value_str) + } else { + value_str + }; + + pivot_exprs.push(pivoted.alias(col_alias)); + } + } + + let plan = LogicalPlanBuilder::from(input) + .aggregate(group_by_cols, pivot_exprs)? + .build()?; + + Ok(RelationPlanning::Planned(Box::new(PlannedRelation::new( + plan, alias, + )))) +} + +// ============================================================================ +// UNPIVOT Implementation +// ============================================================================ + +/// Rewrite UNPIVOT to UNION ALL of projections.
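+/// +/// Each unpivoted column becomes one branch, roughly: +/// ```text +/// SELECT <kept columns>, '<label>' AS <name>, <column> AS <value> FROM input +/// ``` +/// combined with UNION ALL and, unless INCLUDE NULLS was given, filtered +/// with `IS NOT NULL` on the value column.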
+fn plan_unpivot( + ctx: &mut dyn RelationPlannerContext, + table: TableFactor, + value: &datafusion_sql::sqlparser::ast::Expr, + name: datafusion_sql::sqlparser::ast::Ident, + columns: &[datafusion_sql::sqlparser::ast::ExprWithAlias], + null_inclusion: Option<&NullInclusion>, + alias: Option<datafusion_sql::sqlparser::ast::TableAlias>, +) -> Result<RelationPlanning> { + // Plan the input table + let input = ctx.plan(table)?; + let schema = input.schema(); + + // Output column names + let value_col_name = value.to_string(); + let name_col_name = ctx.normalize_ident(name); + + // Parse columns to unpivot: (source_column, label) + let unpivot_cols: Vec<(String, String)> = columns + .iter() + .map(|c| { + let label = c + .alias + .as_ref() + .map(|id| ctx.normalize_ident(id.clone())) + .unwrap_or_else(|| c.expr.to_string()); + let expr = ctx.sql_to_expr(c.expr.clone(), schema.as_ref())?; + let col_name = extract_column_name(&expr)?; + Ok((col_name.to_string(), label)) + }) + .collect::<Result<_>>()?; + + // Columns to preserve (not being unpivoted) + let keep_cols: Vec<&str> = schema + .fields() + .iter() + .map(|f| f.name().as_str()) + .filter(|name| !unpivot_cols.iter().any(|(c, _)| c == *name)) + .collect(); + + // Build UNION ALL: one SELECT per unpivot column + if unpivot_cols.is_empty() { + return Err(plan_datafusion_err!("UNPIVOT requires at least one column")); + } + + let mut union_inputs: Vec<_> = unpivot_cols + .iter() + .map(|(col_name, label)| { + let mut projection: Vec<Expr> = keep_cols.iter().map(|c| col(*c)).collect(); + projection.push(lit(label.clone()).alias(&name_col_name)); + projection.push(col(col_name).alias(&value_col_name)); + + LogicalPlanBuilder::from(input.clone()) + .project(projection)? + .build() + }) + .collect::<Result<_>>()?; + + // Combine with UNION ALL + let mut plan = union_inputs.remove(0); + for branch in union_inputs { + plan = LogicalPlanBuilder::from(plan).union(branch)?.build()?; + } + + // Apply EXCLUDE NULLS filter (default behavior) + let exclude_nulls = null_inclusion.is_none() + || matches!(null_inclusion, Some(&NullInclusion::ExcludeNulls)); + if exclude_nulls { + plan = LogicalPlanBuilder::from(plan) + .filter(col(&value_col_name).is_not_null())? + .build()?; + } + + Ok(RelationPlanning::Planned(Box::new(PlannedRelation::new( + plan, alias, + )))) +} + +// ============================================================================ +// Helpers +// ============================================================================ + +/// Extract column name from an expression. +fn extract_column_name(expr: &Expr) -> Result<String> { + match expr { + Expr::Column(c) => Ok(c.name.clone()), + _ => Err(plan_datafusion_err!( + "Expected column reference, got {expr}" + )), + } +} + +/// Convert an expression to a string for use as column alias. +fn expr_to_string(expr: &Expr) -> String { + match expr { + Expr::Literal(ScalarValue::Utf8(Some(s)), _) => s.clone(), + Expr::Literal(v, _) => v.to_string(), + other => other.to_string(), + } +} diff --git a/datafusion-examples/examples/relation_planner/table_sample.rs b/datafusion-examples/examples/relation_planner/table_sample.rs new file mode 100644 index 0000000000000..42342e5f1a641 --- /dev/null +++ b/datafusion-examples/examples/relation_planner/table_sample.rs @@ -0,0 +1,831 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! # TABLESAMPLE Example +//! +//! This example demonstrates implementing SQL `TABLESAMPLE` support using +//! DataFusion's extensibility APIs. +//! +//! This is a working `TABLESAMPLE` implementation that can serve as a starting +//! point for your own projects. It also works as a template for adding other +//! custom SQL operators, covering the full pipeline from parsing to execution. +//! +//! It shows how to: +//! +//! 1. **Parse** TABLESAMPLE syntax via a custom [`RelationPlanner`] +//! 2. **Plan** sampling as a custom logical node ([`TableSamplePlanNode`]) +//! 3. **Execute** sampling via a custom physical operator ([`SampleExec`]) +//! +//! ## Supported Syntax +//! +//! ```sql +//! -- Bernoulli sampling (each row has N% chance of selection) +//! SELECT * FROM table TABLESAMPLE BERNOULLI(10 PERCENT) +//! +//! -- Fractional sampling (0.0 to 1.0) +//! SELECT * FROM table TABLESAMPLE (0.1) +//! +//! -- Row count limit +//! SELECT * FROM table TABLESAMPLE (100 ROWS) +//! +//! -- Reproducible sampling with a seed +//! SELECT * FROM table TABLESAMPLE (10 PERCENT) REPEATABLE(42) +//! ``` +//! +//! ## Architecture +//! +//! ```text +//! ┌─────────────────────────────────────────────────────────────────┐ +//! │ SQL Query │ +//! │ SELECT * FROM t TABLESAMPLE BERNOULLI(10 PERCENT) REPEATABLE(1)│ +//! └─────────────────────────────────────────────────────────────────┘ +//! │ +//! ▼ +//! ┌─────────────────────────────────────────────────────────────────┐ +//! │ TableSamplePlanner │ +//! │ (RelationPlanner: parses TABLESAMPLE, creates logical node) │ +//! └─────────────────────────────────────────────────────────────────┘ +//! │ +//! ▼ +//! ┌─────────────────────────────────────────────────────────────────┐ +//! │ TableSamplePlanNode │ +//! │ (UserDefinedLogicalNode: stores sampling params) │ +//! └─────────────────────────────────────────────────────────────────┘ +//! │ +//! ▼ +//! ┌─────────────────────────────────────────────────────────────────┐ +//! │ TableSampleExtensionPlanner │ +//! │ (ExtensionPlanner: creates physical execution plan) │ +//! └─────────────────────────────────────────────────────────────────┘ +//! │ +//! ▼ +//! ┌─────────────────────────────────────────────────────────────────┐ +//! │ SampleExec │ +//! │ (ExecutionPlan: performs actual row sampling at runtime) │ +//! └─────────────────────────────────────────────────────────────────┘ +//! 
``` + +use std::{ + fmt::{self, Debug, Formatter}, + hash::{Hash, Hasher}, + pin::Pin, + sync::Arc, + task::{Context, Poll}, +}; + +use arrow::datatypes::{Float64Type, Int64Type}; +use arrow::{ + array::{ArrayRef, Int32Array, RecordBatch, StringArray, UInt32Array}, + compute, +}; +use arrow_schema::SchemaRef; +use futures::{ + ready, + stream::{Stream, StreamExt}, +}; +use rand::{Rng, SeedableRng, rngs::StdRng}; +use tonic::async_trait; + +use datafusion::optimizer::simplify_expressions::simplify_literal::parse_literal; +use datafusion::{ + execution::{ + RecordBatchStream, SendableRecordBatchStream, SessionState, SessionStateBuilder, + TaskContext, context::QueryPlanner, + }, + physical_expr::EquivalenceProperties, + physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, + metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet, RecordOutput}, + }, + physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner}, + prelude::*, +}; +use datafusion_common::{ + DFSchemaRef, DataFusionError, Result, Statistics, internal_err, not_impl_err, + plan_datafusion_err, plan_err, tree_node::TreeNodeRecursion, +}; +use datafusion_expr::{ + UserDefinedLogicalNode, UserDefinedLogicalNodeCore, + logical_plan::{Extension, LogicalPlan, LogicalPlanBuilder}, + planner::{ + PlannedRelation, RelationPlanner, RelationPlannerContext, RelationPlanning, + }, +}; +use datafusion_sql::sqlparser::ast::{ + self, TableFactor, TableSampleMethod, TableSampleUnit, +}; +use insta::assert_snapshot; + +// ============================================================================ +// Example Entry Point +// ============================================================================ + +/// Runs the TABLESAMPLE examples demonstrating various sampling techniques. 
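+/// +/// Run it in isolation with: +/// ```bash +/// cargo run --example relation_planner -- table_sample +/// ``` +/// The percentage and fraction examples pass `REPEATABLE(seed)` so that the +/// sampled output stays deterministic for the snapshot assertions.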
+pub async fn table_sample() -> Result<()> { + // Build session with custom query planner for physical planning + let state = SessionStateBuilder::new() + .with_default_features() + .with_query_planner(Arc::new(TableSampleQueryPlanner)) + .build(); + + let ctx = SessionContext::new_with_state(state); + + // Register custom relation planner for logical planning + ctx.register_relation_planner(Arc::new(TableSamplePlanner))?; + register_sample_data(&ctx)?; + + println!("TABLESAMPLE Example"); + println!("===================\n"); + + run_examples(&ctx).await +} + +async fn run_examples(ctx: &SessionContext) -> Result<()> { + // Example 1: Baseline - full table scan + let results = run_example( + ctx, + "Example 1: Full table (baseline)", + "SELECT * FROM sample_data", + ) + .await?; + assert_snapshot!(results, @r" + +---------+---------+ + | column1 | column2 | + +---------+---------+ + | 1 | row_1 | + | 2 | row_2 | + | 3 | row_3 | + | 4 | row_4 | + | 5 | row_5 | + | 6 | row_6 | + | 7 | row_7 | + | 8 | row_8 | + | 9 | row_9 | + | 10 | row_10 | + +---------+---------+ + "); + + // Example 2: Percentage-based Bernoulli sampling + // REPEATABLE(seed) ensures deterministic results for snapshot testing + let results = run_example( + ctx, + "Example 2: BERNOULLI percentage sampling", + "SELECT * FROM sample_data TABLESAMPLE BERNOULLI(30 PERCENT) REPEATABLE(123)", + ) + .await?; + assert_snapshot!(results, @r" + +---------+---------+ + | column1 | column2 | + +---------+---------+ + | 1 | row_1 | + | 2 | row_2 | + | 7 | row_7 | + | 8 | row_8 | + +---------+---------+ + "); + + // Example 3: Fractional sampling (0.0 to 1.0) + // REPEATABLE(seed) ensures deterministic results for snapshot testing + let results = run_example( + ctx, + "Example 3: Fractional sampling", + "SELECT * FROM sample_data TABLESAMPLE (0.5) REPEATABLE(456)", + ) + .await?; + assert_snapshot!(results, @r" + +---------+---------+ + | column1 | column2 | + +---------+---------+ + | 2 | row_2 | + | 4 | row_4 | + | 8 | row_8 | + +---------+---------+ + "); + + // Example 4: Row count limit (deterministic, no seed needed) + let results = run_example( + ctx, + "Example 4: Row count limit", + "SELECT * FROM sample_data TABLESAMPLE (3 ROWS)", + ) + .await?; + assert_snapshot!(results, @r" + +---------+---------+ + | column1 | column2 | + +---------+---------+ + | 1 | row_1 | + | 2 | row_2 | + | 3 | row_3 | + +---------+---------+ + "); + + // Example 5: Sampling combined with filtering + let results = run_example( + ctx, + "Example 5: Sampling with WHERE clause", + "SELECT * FROM sample_data TABLESAMPLE (5 ROWS) WHERE column1 > 2", + ) + .await?; + assert_snapshot!(results, @r" + +---------+---------+ + | column1 | column2 | + +---------+---------+ + | 3 | row_3 | + | 4 | row_4 | + | 5 | row_5 | + +---------+---------+ + "); + + // Example 6: Sampling in JOIN queries + // REPEATABLE(seed) ensures deterministic results for snapshot testing + let results = run_example( + ctx, + "Example 6: Sampling in JOINs", + r#"SELECT t1.column1, t2.column1, t1.column2, t2.column2 + FROM sample_data t1 TABLESAMPLE (0.7) REPEATABLE(789) + JOIN sample_data t2 TABLESAMPLE (0.7) REPEATABLE(123) + ON t1.column1 = t2.column1"#, + ) + .await?; + assert_snapshot!(results, @r" + +---------+---------+---------+---------+ + | column1 | column1 | column2 | column2 | + +---------+---------+---------+---------+ + | 2 | 2 | row_2 | row_2 | + | 5 | 5 | row_5 | row_5 | + | 7 | 7 | row_7 | row_7 | + | 8 | 8 | row_8 | row_8 | + | 10 | 10 | row_10 | row_10 | + 
+---------+---------+---------+---------+
+    ");
+
+    Ok(())
+}
+
+/// Helper to run a single example query and capture results.
+async fn run_example(ctx: &SessionContext, title: &str, sql: &str) -> Result<String> {
+    println!("{title}:\n{sql}\n");
+    let df = ctx.sql(sql).await?;
+    println!("{}\n", df.logical_plan().display_indent());
+
+    let batches = df.collect().await?;
+    let results = arrow::util::pretty::pretty_format_batches(&batches)?.to_string();
+    println!("{results}\n");
+
+    Ok(results)
+}
+
+/// Register test data: 10 rows with column1=1..10 and column2="row_1".."row_10"
+fn register_sample_data(ctx: &SessionContext) -> Result<()> {
+    let column1: ArrayRef = Arc::new(Int32Array::from((1..=10).collect::<Vec<i32>>()));
+    let column2: ArrayRef = Arc::new(StringArray::from(
+        (1..=10).map(|i| format!("row_{i}")).collect::<Vec<String>>(),
+    ));
+    let batch =
+        RecordBatch::try_from_iter(vec![("column1", column1), ("column2", column2)])?;
+    ctx.register_batch("sample_data", batch)?;
+    Ok(())
+}
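+
+// NOTE (editorial sketch): the planner in the next section follows the
+// general `RelationPlanner` contract: return `RelationPlanning::Original` to
+// decline a relation (letting other planners or the default planner handle
+// it), or `RelationPlanning::Planned` with a finished `LogicalPlan`. A
+// minimal pass-through planner (hypothetical, for illustration only):
+//
+//     #[derive(Debug)]
+//     struct PassThroughPlanner;
+//
+//     impl RelationPlanner for PassThroughPlanner {
+//         fn plan_relation(
+//             &self,
+//             relation: TableFactor,
+//             _context: &mut dyn RelationPlannerContext,
+//         ) -> Result<RelationPlanning> {
+//             // Decline everything; planning proceeds unchanged.
+//             Ok(RelationPlanning::Original(Box::new(relation)))
+//         }
+//     }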
+
+// ============================================================================
+// Logical Planning: TableSamplePlanner + TableSamplePlanNode
+// ============================================================================
+
+/// Relation planner that intercepts `TABLESAMPLE` clauses in SQL and creates
+/// [`TableSamplePlanNode`] logical nodes.
+#[derive(Debug)]
+struct TableSamplePlanner;
+
+impl RelationPlanner for TableSamplePlanner {
+    fn plan_relation(
+        &self,
+        relation: TableFactor,
+        context: &mut dyn RelationPlannerContext,
+    ) -> Result<RelationPlanning> {
+        // Only handle Table relations with TABLESAMPLE clause
+        let TableFactor::Table {
+            sample: Some(sample),
+            alias,
+            name,
+            args,
+            with_hints,
+            version,
+            with_ordinality,
+            partitions,
+            json_path,
+            index_hints,
+        } = relation
+        else {
+            return Ok(RelationPlanning::Original(Box::new(relation)));
+        };
+
+        // Extract sample spec (handles both before/after alias positions)
+        let sample = match sample {
+            ast::TableSampleKind::BeforeTableAlias(s)
+            | ast::TableSampleKind::AfterTableAlias(s) => s,
+        };
+
+        // Validate sampling method
+        if let Some(method) = &sample.name
+            && *method != TableSampleMethod::Bernoulli
+            && *method != TableSampleMethod::Row
+        {
+            return not_impl_err!(
+                "Sampling method {} is not supported (only BERNOULLI and ROW)",
+                method
+            );
+        }
+
+        // Offset sampling (ClickHouse-style) not supported
+        if sample.offset.is_some() {
+            return not_impl_err!(
+                "TABLESAMPLE with OFFSET is not supported (requires total row count)"
+            );
+        }
+
+        // Parse optional REPEATABLE seed
+        let seed = sample
+            .seed
+            .map(|s| {
+                s.value.to_string().parse::<u64>().map_err(|_| {
+                    plan_datafusion_err!("REPEATABLE seed must be an integer")
+                })
+            })
+            .transpose()?;
+
+        // Plan the underlying table without the sample clause
+        let base_relation = TableFactor::Table {
+            sample: None,
+            alias: alias.clone(),
+            name,
+            args,
+            with_hints,
+            version,
+            with_ordinality,
+            partitions,
+            json_path,
+            index_hints,
+        };
+        let input = context.plan(base_relation)?;
+
+        // Handle bucket sampling (Hive-style: TABLESAMPLE(BUCKET x OUT OF y))
+        if let Some(bucket) = sample.bucket {
+            if bucket.on.is_some() {
+                return not_impl_err!(
+                    "TABLESAMPLE BUCKET with ON clause requires CLUSTERED BY table"
+                );
+            }
+            let bucket_num: u64 =
+                bucket.bucket.to_string().parse().map_err(|_| {
+                    plan_datafusion_err!("bucket number must be an integer")
+                })?;
+            let total: u64 =
+                bucket.total.to_string().parse().map_err(|_| {
+                    plan_datafusion_err!("bucket total must be an integer")
+                })?;
+
+            let fraction = bucket_num as f64 / total as f64;
+            let plan = TableSamplePlanNode::new(input, fraction, seed).into_plan();
+            return Ok(RelationPlanning::Planned(Box::new(PlannedRelation::new(
+                plan, alias,
+            ))));
+        }
+
+        // Handle quantity-based sampling
+        let Some(quantity) = sample.quantity else {
+            return plan_err!(
+                "TABLESAMPLE requires a quantity (percentage, fraction, or row count)"
+            );
+        };
+        let quantity_value_expr = context.sql_to_expr(quantity.value, input.schema())?;
+
+        match quantity.unit {
+            // TABLESAMPLE (N ROWS) - exact row limit
+            Some(TableSampleUnit::Rows) => {
+                let rows: i64 = parse_literal::<Int64Type>(&quantity_value_expr)?;
+                if rows < 0 {
+                    return plan_err!("row count must be non-negative, got {}", rows);
+                }
+                let plan = LogicalPlanBuilder::from(input)
+                    .limit(0, Some(rows as usize))?
+                    .build()?;
+                Ok(RelationPlanning::Planned(Box::new(PlannedRelation::new(
+                    plan, alias,
+                ))))
+            }
+
+            // TABLESAMPLE (N PERCENT) - percentage sampling
+            Some(TableSampleUnit::Percent) => {
+                let percent: f64 = parse_literal::<Float64Type>(&quantity_value_expr)?;
+                let fraction = percent / 100.0;
+                let plan = TableSamplePlanNode::new(input, fraction, seed).into_plan();
+                Ok(RelationPlanning::Planned(Box::new(PlannedRelation::new(
+                    plan, alias,
+                ))))
+            }
+
+            // TABLESAMPLE (N) - fraction if <1.0, row limit if >=1.0
+            None => {
+                let value = parse_literal::<Float64Type>(&quantity_value_expr)?;
+                if value < 0.0 {
+                    return plan_err!("sample value must be non-negative, got {}", value);
+                }
+                let plan = if value >= 1.0 {
+                    // Interpret as row limit
+                    LogicalPlanBuilder::from(input)
+                        .limit(0, Some(value as usize))?
+                        .build()?
+                } else {
+                    // Interpret as fraction
+                    TableSamplePlanNode::new(input, value, seed).into_plan()
+                };
+                Ok(RelationPlanning::Planned(Box::new(PlannedRelation::new(
+                    plan, alias,
+                ))))
+            }
+        }
+    }
+}
+
+/// Custom logical plan node representing a TABLESAMPLE operation.
+///
+/// Stores sampling parameters (bounds, seed) and wraps the input plan.
+/// Gets converted to [`SampleExec`] during physical planning.
+#[derive(Debug, Clone, Hash, Eq, PartialEq, PartialOrd)]
+struct TableSamplePlanNode {
+    input: LogicalPlan,
+    lower_bound: HashableF64,
+    upper_bound: HashableF64,
+    seed: u64,
+}
+
+impl TableSamplePlanNode {
+    /// Create a new sampling node with the given fraction (0.0 to 1.0).
+    fn new(input: LogicalPlan, fraction: f64, seed: Option<u64>) -> Self {
+        Self {
+            input,
+            lower_bound: HashableF64(0.0),
+            upper_bound: HashableF64(fraction),
+            seed: seed.unwrap_or_else(rand::random),
+        }
+    }
+
+    /// Wrap this node in a LogicalPlan::Extension.
+    fn into_plan(self) -> LogicalPlan {
+        LogicalPlan::Extension(Extension {
+            node: Arc::new(self),
+        })
+    }
+}
+
+impl UserDefinedLogicalNodeCore for TableSamplePlanNode {
+    fn name(&self) -> &str {
+        "TableSample"
+    }
+
+    fn inputs(&self) -> Vec<&LogicalPlan> {
+        vec![&self.input]
+    }
+
+    fn schema(&self) -> &DFSchemaRef {
+        self.input.schema()
+    }
+
+    fn expressions(&self) -> Vec<Expr> {
+        vec![]
+    }
+
+    fn fmt_for_explain(&self, f: &mut Formatter) -> fmt::Result {
+        write!(
+            f,
+            "Sample: bounds=[{}, {}], seed={}",
+            self.lower_bound.0, self.upper_bound.0, self.seed
+        )
+    }
+
+    fn with_exprs_and_inputs(
+        &self,
+        _exprs: Vec<Expr>,
+        mut inputs: Vec<LogicalPlan>,
+    ) -> Result<Self> {
+        Ok(Self {
+            input: inputs.swap_remove(0),
+            lower_bound: self.lower_bound,
+            upper_bound: self.upper_bound,
+            seed: self.seed,
+        })
+    }
+}
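+
+// NOTE (editorial): `LogicalPlan` extension nodes must implement `Hash` and
+// `Eq`, which `f64` does not (NaN != NaN). The wrapper below compares and
+// hashes the raw bit pattern instead, e.g.:
+//
+//     assert_eq!(HashableF64(0.5), HashableF64(0.5));  // identical bits
+//     assert_ne!(HashableF64(0.0), HashableF64(-0.0)); // sign bit differs
+//
+// Bitwise equality is stricter than `==` on `f64` (where `0.0 == -0.0`),
+// which is acceptable for plan-equality purposes.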
+
+/// Wrapper for f64 that implements Hash and Eq (required for LogicalPlan).
+#[derive(Debug, Clone, Copy, PartialOrd)]
+struct HashableF64(f64);
+
+impl PartialEq for HashableF64 {
+    fn eq(&self, other: &Self) -> bool {
+        self.0.to_bits() == other.0.to_bits()
+    }
+}
+
+impl Eq for HashableF64 {}
+
+impl Hash for HashableF64 {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.0.to_bits().hash(state);
+    }
+}
+
+// ============================================================================
+// Physical Planning: TableSampleQueryPlanner + TableSampleExtensionPlanner
+// ============================================================================
+
+/// Custom query planner that registers [`TableSampleExtensionPlanner`] to
+/// convert [`TableSamplePlanNode`] into [`SampleExec`].
+#[derive(Debug)]
+struct TableSampleQueryPlanner;
+
+#[async_trait]
+impl QueryPlanner for TableSampleQueryPlanner {
+    async fn create_physical_plan(
+        &self,
+        logical_plan: &LogicalPlan,
+        session_state: &SessionState,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        let planner = DefaultPhysicalPlanner::with_extension_planners(vec![Arc::new(
+            TableSampleExtensionPlanner,
+        )]);
+        planner
+            .create_physical_plan(logical_plan, session_state)
+            .await
+    }
+}
+
+/// Extension planner that converts [`TableSamplePlanNode`] to [`SampleExec`].
+struct TableSampleExtensionPlanner;
+
+#[async_trait]
+impl ExtensionPlanner for TableSampleExtensionPlanner {
+    async fn plan_extension(
+        &self,
+        _planner: &dyn PhysicalPlanner,
+        node: &dyn UserDefinedLogicalNode,
+        _logical_inputs: &[&LogicalPlan],
+        physical_inputs: &[Arc<dyn ExecutionPlan>],
+        _session_state: &SessionState,
+    ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
+        let Some(sample_node) = node.as_any().downcast_ref::<TableSamplePlanNode>()
+        else {
+            return Ok(None);
+        };
+
+        let exec = SampleExec::try_new(
+            Arc::clone(&physical_inputs[0]),
+            sample_node.lower_bound.0,
+            sample_node.upper_bound.0,
+            sample_node.seed,
+        )?;
+        Ok(Some(Arc::new(exec)))
+    }
+}
+
+// ============================================================================
+// Physical Execution: SampleExec + BernoulliSampler
+// ============================================================================
+
+/// Physical execution plan that samples rows from its input using Bernoulli sampling.
+///
+/// Each row is independently selected with probability `(upper_bound - lower_bound)`
+/// and appears at most once.
+#[derive(Debug, Clone)]
+pub struct SampleExec {
+    input: Arc<dyn ExecutionPlan>,
+    lower_bound: f64,
+    upper_bound: f64,
+    seed: u64,
+    metrics: ExecutionPlanMetricsSet,
+    cache: Arc<PlanProperties>,
+}
+
+impl SampleExec {
+    /// Create a new SampleExec with Bernoulli sampling (without replacement).
+    ///
+    /// # Arguments
+    /// * `input` - The input execution plan
+    /// * `lower_bound` - Lower bound of sampling range (typically 0.0)
+    /// * `upper_bound` - Upper bound of sampling range (0.0 to 1.0)
+    /// * `seed` - Random seed for reproducible sampling
+    pub fn try_new(
+        input: Arc<dyn ExecutionPlan>,
+        lower_bound: f64,
+        upper_bound: f64,
+        seed: u64,
+    ) -> Result<Self> {
+        if lower_bound < 0.0 || upper_bound > 1.0 || lower_bound > upper_bound {
+            return internal_err!(
+                "Sampling bounds must satisfy 0.0 <= lower <= upper <= 1.0, got [{}, {}]",
+                lower_bound,
+                upper_bound
+            );
+        }
+
+        let cache = PlanProperties::new(
+            EquivalenceProperties::new(input.schema()),
+            input.properties().partitioning.clone(),
+            input.properties().emission_type,
+            input.properties().boundedness,
+        );
+
+        Ok(Self {
+            input,
+            lower_bound,
+            upper_bound,
+            seed,
+            metrics: ExecutionPlanMetricsSet::new(),
+            cache: Arc::new(cache),
+        })
+    }
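+
+    // NOTE (editorial): `create_sampler` (below) derives a per-partition RNG
+    // seed via `seed.wrapping_add(partition as u64)`: partition 0 uses `seed`,
+    // partition 1 uses `seed + 1`, and so on. Results stay reproducible for a
+    // fixed seed, while partitions avoid sampling identical row positions in
+    // lockstep.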
+    /// Create a sampler for the given partition.
+    fn create_sampler(&self, partition: usize) -> BernoulliSampler {
+        let seed = self.seed.wrapping_add(partition as u64);
+        BernoulliSampler::new(self.lower_bound, self.upper_bound, seed)
+    }
+}
+
+impl DisplayAs for SampleExec {
+    fn fmt_as(&self, _t: DisplayFormatType, f: &mut Formatter) -> fmt::Result {
+        write!(
+            f,
+            "SampleExec: bounds=[{}, {}], seed={}",
+            self.lower_bound, self.upper_bound, self.seed
+        )
+    }
+}
+
+impl ExecutionPlan for SampleExec {
+    fn name(&self) -> &'static str {
+        "SampleExec"
+    }
+
+    fn properties(&self) -> &Arc<PlanProperties> {
+        &self.cache
+    }
+
+    fn maintains_input_order(&self) -> Vec<bool> {
+        // Sampling preserves row order (rows are filtered, not reordered)
+        vec![true]
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        vec![&self.input]
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        Ok(Arc::new(Self::try_new(
+            children.swap_remove(0),
+            self.lower_bound,
+            self.upper_bound,
+            self.seed,
+        )?))
+    }
+
+    fn execute(
+        &self,
+        partition: usize,
+        context: Arc<TaskContext>,
+    ) -> Result<SendableRecordBatchStream> {
+        Ok(Box::pin(SampleStream {
+            input: self.input.execute(partition, context)?,
+            sampler: self.create_sampler(partition),
+            metrics: BaselineMetrics::new(&self.metrics, partition),
+        }))
+    }
+
+    fn metrics(&self) -> Option<MetricsSet> {
+        Some(self.metrics.clone_inner())
+    }
+
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        let mut stats = Arc::unwrap_or_clone(self.input.partition_statistics(partition)?);
+        let ratio = self.upper_bound - self.lower_bound;
+
+        // Scale statistics by sampling ratio (inexact due to randomness)
+        stats.num_rows = stats
+            .num_rows
+            .map(|n| (n as f64 * ratio) as usize)
+            .to_inexact();
+        stats.total_byte_size = stats
+            .total_byte_size
+            .map(|n| (n as f64 * ratio) as usize)
+            .to_inexact();
+
+        Ok(Arc::new(stats))
+    }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(
+            &dyn datafusion::physical_plan::PhysicalExpr,
+        ) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in the output ordering from equivalence properties
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = self.cache.output_ordering() {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
+    }
+}
+
+/// Bernoulli sampler: includes each row with probability `(upper - lower)`.
+/// This is sampling **without replacement** - each row appears at most once.
+struct BernoulliSampler {
+    lower_bound: f64,
+    upper_bound: f64,
+    rng: StdRng,
+}
+
+impl BernoulliSampler {
+    fn new(lower_bound: f64, upper_bound: f64, seed: u64) -> Self {
+        Self {
+            lower_bound,
+            upper_bound,
+            rng: StdRng::seed_from_u64(seed),
+        }
+    }
+
+    fn sample(&mut self, batch: &RecordBatch) -> Result<RecordBatch> {
+        let range = self.upper_bound - self.lower_bound;
+        if range <= 0.0 {
+            return Ok(RecordBatch::new_empty(batch.schema()));
+        }
+
+        // Select rows where random value falls in [lower, upper)
+        let indices: Vec<u32> = (0..batch.num_rows())
+            .filter(|_| {
+                let r: f64 = self.rng.random();
+                r >= self.lower_bound && r < self.upper_bound
+            })
+            .map(|i| i as u32)
+            .collect();
+
+        if indices.is_empty() {
+            return Ok(RecordBatch::new_empty(batch.schema()));
+        }
+
+        compute::take_record_batch(batch, &UInt32Array::from(indices))
+            .map_err(DataFusionError::from)
+    }
+}
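+
+// NOTE (editorial): with Bernoulli sampling the output size is itself random.
+// For n input rows and fraction p = upper - lower, the selected row count
+// follows Binomial(n, p): the expectation is n * p, but a single run can keep
+// more or fewer rows (Example 2 keeps 4 of 10 rows at 30 PERCENT). Use
+// `TABLESAMPLE (k ROWS)` when an exact row count is required.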
+
+/// Stream adapter that applies sampling to each batch.
+struct SampleStream {
+    input: SendableRecordBatchStream,
+    sampler: BernoulliSampler,
+    metrics: BaselineMetrics,
+}
+
+impl Stream for SampleStream {
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Self::Item>> {
+        match ready!(self.input.poll_next_unpin(cx)) {
+            Some(Ok(batch)) => {
+                let elapsed = self.metrics.elapsed_compute().clone();
+                let _timer = elapsed.timer();
+                let result = self.sampler.sample(&batch);
+                Poll::Ready(Some(result.record_output(&self.metrics)))
+            }
+            Some(Err(e)) => Poll::Ready(Some(Err(e))),
+            None => Poll::Ready(None),
+        }
+    }
+}
+
+impl RecordBatchStream for SampleStream {
+    fn schema(&self) -> SchemaRef {
+        self.input.schema()
+    }
+}
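+
+// NOTE (editorial sketch): determinism of seeded sampling can be verified with
+// a small test along these lines (`run` is a hypothetical helper that executes
+// the SQL and collects formatted results; it is not part of this example):
+//
+//     #[tokio::test]
+//     async fn repeatable_seed_is_deterministic() {
+//         let sql = "SELECT * FROM sample_data TABLESAMPLE (0.5) REPEATABLE(456)";
+//         // The same seeded query must select the same rows on every run.
+//         assert_eq!(run(sql).await, run(sql).await);
+//     }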
diff --git a/datafusion-examples/examples/sql_dialect.rs b/datafusion-examples/examples/sql_dialect.rs
deleted file mode 100644
index 20b515506f3b4..0000000000000
--- a/datafusion-examples/examples/sql_dialect.rs
+++ /dev/null
@@ -1,134 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use std::fmt::Display;
-
-use datafusion::error::{DataFusionError, Result};
-use datafusion::sql::{
-    parser::{CopyToSource, CopyToStatement, DFParser, DFParserBuilder, Statement},
-    sqlparser::{keywords::Keyword, tokenizer::Token},
-};
-
-/// This example demonstrates how to use the DFParser to parse a statement in a custom way
-///
-/// This technique can be used to implement a custom SQL dialect, for example.
-#[tokio::main]
-async fn main() -> Result<()> {
-    let mut my_parser =
-        MyParser::new("COPY source_table TO 'file.fasta' STORED AS FASTA")?;
-
-    let my_statement = my_parser.parse_statement()?;
-
-    match my_statement {
-        MyStatement::DFStatement(s) => println!("df: {s}"),
-        MyStatement::MyCopyTo(s) => println!("my_copy: {s}"),
-    }
-
-    Ok(())
-}
-
-/// Here we define a Parser for our new SQL dialect that wraps the existing `DFParser`
-struct MyParser<'a> {
-    df_parser: DFParser<'a>,
-}
-
-impl<'a> MyParser<'a> {
-    fn new(sql: &'a str) -> Result<Self> {
-        let df_parser = DFParserBuilder::new(sql).build()?;
-        Ok(Self { df_parser })
-    }
-
-    /// Returns true if the next token is `COPY` keyword, false otherwise
-    fn is_copy(&self) -> bool {
-        matches!(
-            self.df_parser.parser.peek_token().token,
-            Token::Word(w) if w.keyword == Keyword::COPY
-        )
-    }
-
-    /// This is the entry point to our parser -- it handles `COPY` statements specially
-    /// but otherwise delegates to the existing DataFusion parser.
-    pub fn parse_statement(&mut self) -> Result<MyStatement> {
-        if self.is_copy() {
-            self.df_parser.parser.next_token(); // COPY
-            let df_statement = self.df_parser.parse_copy()?;
-
-            if let Statement::CopyTo(s) = df_statement {
-                Ok(MyStatement::from(s))
-            } else {
-                Ok(MyStatement::DFStatement(Box::from(df_statement)))
-            }
-        } else {
-            let df_statement = self.df_parser.parse_statement()?;
-            Ok(MyStatement::from(df_statement))
-        }
-    }
-}
-
-enum MyStatement {
-    DFStatement(Box<Statement>),
-    MyCopyTo(MyCopyToStatement),
-}
-
-impl Display for MyStatement {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            MyStatement::DFStatement(s) => write!(f, "{s}"),
-            MyStatement::MyCopyTo(s) => write!(f, "{s}"),
-        }
-    }
-}
-
-impl From<Statement> for MyStatement {
-    fn from(s: Statement) -> Self {
-        Self::DFStatement(Box::from(s))
-    }
-}
-
-impl From<CopyToStatement> for MyStatement {
-    fn from(s: CopyToStatement) -> Self {
-        if s.stored_as == Some("FASTA".to_string()) {
-            Self::MyCopyTo(MyCopyToStatement::from(s))
-        } else {
-            Self::DFStatement(Box::from(Statement::CopyTo(s)))
-        }
-    }
-}
-
-struct MyCopyToStatement {
-    pub source: CopyToSource,
-    pub target: String,
-}
-
-impl From<CopyToStatement> for MyCopyToStatement {
-    fn from(s: CopyToStatement) -> Self {
-        Self {
-            source: s.source,
-            target: s.target,
-        }
-    }
-}
-
-impl Display for MyCopyToStatement {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "COPY {} TO '{}' STORED AS FASTA",
-            self.source, self.target
-        )
-    }
-}
diff --git a/datafusion-examples/examples/sql_analysis.rs b/datafusion-examples/examples/sql_ops/analysis.rs
similarity index 98%
rename from datafusion-examples/examples/sql_analysis.rs
rename to datafusion-examples/examples/sql_ops/analysis.rs
index 4ff669faf1d0c..4243a2927865b 100644
--- a/datafusion-examples/examples/sql_analysis.rs
+++ b/datafusion-examples/examples/sql_ops/analysis.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+//!
 //! This example shows how to use the structures that DataFusion provides to perform
 //! Analysis on SQL queries and their plans.
 //!
@@ -23,8 +25,8 @@
 
 use std::sync::Arc;
 
-use datafusion::common::tree_node::{TreeNode, TreeNodeRecursion};
 use datafusion::common::Result;
+use datafusion::common::tree_node::{TreeNode, TreeNodeRecursion};
 use datafusion::logical_expr::LogicalPlan;
 use datafusion::{
     datasource::MemTable,
@@ -32,141 +34,9 @@
 };
 use test_utils::tpcds::tpcds_schemas;
 
-/// Counts the total number of joins in a plan
-fn total_join_count(plan: &LogicalPlan) -> usize {
-    let mut total = 0;
-
-    // We can use the TreeNode API to walk over a LogicalPlan.
-    plan.apply(|node| {
-        // if we encounter a join we update the running count
-        if matches!(node, LogicalPlan::Join(_)) {
-            total += 1;
-        }
-        Ok(TreeNodeRecursion::Continue)
-    })
-    .unwrap();
-
-    total
-}
-
-/// Counts the total number of joins in a plan and collects every join tree in
-/// the plan with their respective join count.
-///
-/// Join Tree Definition: the largest subtree consisting entirely of joins
-///
-/// For example, this plan:
-///
-/// ```text
-///      JOIN
-///     /    \
-///    A     JOIN
-///          /   \
-///         B     C
-/// ```
-///
-/// has a single join tree `(A-B-C)` which will result in `(2, [2])`
-///
-/// This plan:
-///
-/// ```text
-///      JOIN
-///     /    \
-///    A     GROUP
-///            |
-///           JOIN
-///          /   \
-///         B     C
-/// ```
-///
-/// Has two join trees `(A-, B-C)` which will result in `(2, [1, 1])`
-fn count_trees(plan: &LogicalPlan) -> (usize, Vec<usize>) {
-    // this works the same way as `total_count`, but now when we encounter a Join
-    // we try to collect it's entire tree
-    let mut to_visit = vec![plan];
-    let mut total = 0;
-    let mut groups = vec![];
-
-    while let Some(node) = to_visit.pop() {
-        // if we encounter a join, we know were at the root of the tree
-        // count this tree and recurse on it's inputs
-        if matches!(node, LogicalPlan::Join(_)) {
-            let (group_count, inputs) = count_tree(node);
-            total += group_count;
-            groups.push(group_count);
-            to_visit.extend(inputs);
-        } else {
-            to_visit.extend(node.inputs());
-        }
-    }
-
-    (total, groups)
-}
-
-/// Count the entire join tree and return its inputs using TreeNode API
-///
-/// For example, if this function receives following plan:
-///
-/// ```text
-///      JOIN
-///     /    \
-///    A     GROUP
-///            |
-///           JOIN
-///          /   \
-///         B     C
-/// ```
-///
-/// It will return `(1, [A, GROUP])`
-fn count_tree(join: &LogicalPlan) -> (usize, Vec<&LogicalPlan>) {
-    let mut inputs = Vec::new();
-    let mut total = 0;
-
-    join.apply(|node| {
-        // Some extra knowledge:
-        //
-        // optimized plans have their projections pushed down as far as
-        // possible, which sometimes results in a projection going in between 2
-        // subsequent joins giving the illusion these joins are not "related",
-        // when in fact they are.
-        //
-        // This plan:
-        //      JOIN
-        //     /    \
-        //    A     PROJECTION
-        //            |
-        //           JOIN
-        //          /   \
-        //         B     C
-        //
-        // is the same as:
-        //
-        //      JOIN
-        //     /    \
-        //    A     JOIN
-        //          /   \
-        //         B     C
-        // we can continue the recursion in this case
-        if let LogicalPlan::Projection(_) = node {
-            return Ok(TreeNodeRecursion::Continue);
-        }
-
-        // any join we count
-        if matches!(node, LogicalPlan::Join(_)) {
-            total += 1;
-            Ok(TreeNodeRecursion::Continue)
-        } else {
-            inputs.push(node);
-            // skip children of input node
-            Ok(TreeNodeRecursion::Jump)
-        }
-    })
-    .unwrap();
-
-    (total, inputs)
-}
-
-#[tokio::main]
-async fn main() -> Result<()> {
+/// Demonstrates how to analyze a SQL query by counting JOINs and identifying
+/// join-trees using DataFusion’s `LogicalPlan` and `TreeNode` API.
+pub async fn analysis() -> Result<()> {
     // To show how we can count the joins in a sql query we'll be using query 88
     // from the TPC-DS benchmark.
     //
@@ -310,3 +180,136 @@ from
 
     Ok(())
 }
+
+/// Counts the total number of joins in a plan
+fn total_join_count(plan: &LogicalPlan) -> usize {
+    let mut total = 0;
+
+    // We can use the TreeNode API to walk over a LogicalPlan.
+    plan.apply(|node| {
+        // if we encounter a join we update the running count
+        if matches!(node, LogicalPlan::Join(_)) {
+            total += 1;
+        }
+        Ok(TreeNodeRecursion::Continue)
+    })
+    .unwrap();
+
+    total
+}
+
+/// Counts the total number of joins in a plan and collects every join tree in
+/// the plan with their respective join count.
+///
+/// Join Tree Definition: the largest subtree consisting entirely of joins
+///
+/// For example, this plan:
+///
+/// ```text
+///      JOIN
+///     /    \
+///    A     JOIN
+///          /   \
+///         B     C
+/// ```
+///
+/// has a single join tree `(A-B-C)` which will result in `(2, [2])`
+///
+/// This plan:
+///
+/// ```text
+///      JOIN
+///     /    \
+///    A     GROUP
+///            |
+///           JOIN
+///          /   \
+///         B     C
+/// ```
+///
+/// Has two join trees `(A-, B-C)` which will result in `(2, [1, 1])`
+fn count_trees(plan: &LogicalPlan) -> (usize, Vec<usize>) {
+    // this works the same way as `total_join_count`, but now when we encounter a Join
+    // we try to collect its entire tree
+    let mut to_visit = vec![plan];
+    let mut total = 0;
+    let mut groups = vec![];
+
+    while let Some(node) = to_visit.pop() {
+        // if we encounter a join, we know we're at the root of the tree;
+        // count this tree and recurse on its inputs
+        if matches!(node, LogicalPlan::Join(_)) {
+            let (group_count, inputs) = count_tree(node);
+            total += group_count;
+            groups.push(group_count);
+            to_visit.extend(inputs);
+        } else {
+            to_visit.extend(node.inputs());
+        }
+    }
+
+    (total, groups)
+}
+
+/// Count the entire join tree and return its inputs using TreeNode API
+///
+/// For example, if this function receives the following plan:
+///
+/// ```text
+///      JOIN
+///     /    \
+///    A     GROUP
+///            |
+///           JOIN
+///          /   \
+///         B     C
+/// ```
+///
+/// It will return `(1, [A, GROUP])`
+fn count_tree(join: &LogicalPlan) -> (usize, Vec<&LogicalPlan>) {
+    let mut inputs = Vec::new();
+    let mut total = 0;
+
+    join.apply(|node| {
+        // Some extra knowledge:
+        //
+        // optimized plans have their projections pushed down as far as
+        // possible, which sometimes results in a projection going in between 2
+        // subsequent joins giving the illusion these joins are not "related",
+        // when in fact they are.
+        //
+        // This plan:
+        //      JOIN
+        //     /    \
+        //    A     PROJECTION
+        //            |
+        //           JOIN
+        //          /   \
+        //         B     C
+        //
+        // is the same as:
+        //
+        //      JOIN
+        //     /    \
+        //    A     JOIN
+        //          /   \
+        //         B     C
+        // we can continue the recursion in this case
+        if let LogicalPlan::Projection(_) = node {
+            return Ok(TreeNodeRecursion::Continue);
+        }
+
+        // any join we count
+        if matches!(node, LogicalPlan::Join(_)) {
+            total += 1;
+            Ok(TreeNodeRecursion::Continue)
+        } else {
+            inputs.push(node);
+            // skip children of input node
+            Ok(TreeNodeRecursion::Jump)
+        }
+    })
+    .unwrap();
+
+    (total, inputs)
+}
diff --git a/datafusion-examples/examples/sql_ops/custom_sql_parser.rs b/datafusion-examples/examples/sql_ops/custom_sql_parser.rs
new file mode 100644
index 0000000000000..308a0de62a242
--- /dev/null
+++ b/datafusion-examples/examples/sql_ops/custom_sql_parser.rs
@@ -0,0 +1,420 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//!
This example demonstrates extending the DataFusion SQL parser to support +//! custom DDL statements, specifically `CREATE EXTERNAL CATALOG`. +//! +//! ### Custom Syntax +//! ```sql +//! CREATE EXTERNAL CATALOG my_catalog +//! STORED AS ICEBERG +//! LOCATION 's3://my-bucket/warehouse/' +//! OPTIONS ( +//! 'region' = 'us-west-2' +//! ); +//! ``` +//! +//! Note: For the purpose of this example, we use `local://workspace/` to +//! automatically discover and register files from the project's test data. + +use std::collections::HashMap; +use std::fmt::Display; +use std::sync::Arc; + +use datafusion::catalog::{ + CatalogProvider, MemoryCatalogProvider, MemorySchemaProvider, SchemaProvider, + TableProviderFactory, +}; +use datafusion::datasource::listing_table_factory::ListingTableFactory; +use datafusion::error::{DataFusionError, Result}; +use datafusion::prelude::SessionContext; +use datafusion::sql::{ + parser::{DFParser, DFParserBuilder, Statement}, + sqlparser::{ + ast::{ObjectName, Value}, + keywords::Keyword, + tokenizer::Token, + }, +}; +use datafusion_common::{DFSchema, TableReference, plan_datafusion_err, plan_err}; +use datafusion_expr::CreateExternalTable; +use futures::StreamExt; +use insta::assert_snapshot; +use object_store::ObjectStore; +use object_store::local::LocalFileSystem; + +/// Entry point for the example. +pub async fn custom_sql_parser() -> Result<()> { + // Use standard Parquet testing data as our "external" source. + let base_path = datafusion::common::test_util::parquet_test_data(); + let base_path = std::path::Path::new(&base_path).canonicalize()?; + + // Make the path relative to the workspace root + let workspace_root = workspace_root(); + let location = base_path + .strip_prefix(&workspace_root) + .map(|p| p.to_string_lossy().to_string()) + .unwrap_or_else(|_| base_path.to_string_lossy().to_string()); + + let create_catalog_sql = format!( + "CREATE EXTERNAL CATALOG parquet_testing + STORED AS parquet + LOCATION 'local://workspace/{location}' + OPTIONS ( + 'schema_name' = 'staged_data', + 'format.pruning' = 'true' + )" + ); + + // ========================================================================= + // Part 1: Standard DataFusion parser rejects the custom DDL + // ========================================================================= + println!("=== Part 1: Standard DataFusion Parser ===\n"); + println!("Parsing: {}\n", create_catalog_sql.trim()); + + let ctx_standard = SessionContext::new(); + let err = ctx_standard + .sql(&create_catalog_sql) + .await + .expect_err("Expected the standard parser to reject CREATE EXTERNAL CATALOG (custom DDL syntax)"); + + println!("Error: {err}\n"); + assert_snapshot!(err.to_string(), @r#"SQL error: ParserError("Expected: TABLE, found: CATALOG at Line: 1, Column: 17")"#); + + // ========================================================================= + // Part 2: Custom parser handles the statement + // ========================================================================= + println!("=== Part 2: Custom Parser ===\n"); + println!("Parsing: {}\n", create_catalog_sql.trim()); + + let ctx = SessionContext::new(); + + let mut parser = CustomParser::new(&create_catalog_sql)?; + let statement = parser.parse_statement()?; + match statement { + CustomStatement::CreateExternalCatalog(stmt) => { + handle_create_external_catalog(&ctx, stmt).await?; + } + CustomStatement::DFStatement(_) => { + panic!("Expected CreateExternalCatalog statement"); + } + } + + // Query a table from the registered catalog + let query_sql = "SELECT id, 
bool_col, tinyint_col FROM parquet_testing.staged_data.alltypes_plain LIMIT 5";
+    println!("Executing: {query_sql}\n");
+
+    let results = execute_sql(&ctx, query_sql).await?;
+    println!("{results}");
+    assert_snapshot!(results, @r"
+    +----+----------+-------------+
+    | id | bool_col | tinyint_col |
+    +----+----------+-------------+
+    | 4  | true     | 0           |
+    | 5  | false    | 1           |
+    | 6  | true     | 0           |
+    | 7  | false    | 1           |
+    | 2  | true     | 0           |
+    +----+----------+-------------+
+    ");
+
+    Ok(())
+}
+
+/// Execute SQL and return formatted results.
+async fn execute_sql(ctx: &SessionContext, sql: &str) -> Result<String> {
+    let batches = ctx.sql(sql).await?.collect().await?;
+    Ok(arrow::util::pretty::pretty_format_batches(&batches)?.to_string())
+}
+
+/// Custom handler for the `CREATE EXTERNAL CATALOG` statement.
+async fn handle_create_external_catalog(
+    ctx: &SessionContext,
+    stmt: CreateExternalCatalog,
+) -> Result<()> {
+    let factory = ListingTableFactory::new();
+    let catalog = Arc::new(MemoryCatalogProvider::new());
+    let schema = Arc::new(MemorySchemaProvider::new());
+
+    // Extract options
+    let mut schema_name = "public".to_string();
+    let mut table_options = HashMap::new();
+
+    for (k, v) in stmt.options {
+        let val_str = match v {
+            Value::SingleQuotedString(ref s) | Value::DoubleQuotedString(ref s) => {
+                s.to_string()
+            }
+            Value::Number(ref n, _) => n.to_string(),
+            Value::Boolean(b) => b.to_string(),
+            _ => v.to_string(),
+        };
+
+        if k == "schema_name" {
+            schema_name = val_str;
+        } else {
+            table_options.insert(k, val_str);
+        }
+    }
+
+    println!("  Target Catalog: {}", stmt.name);
+    println!("  Data Location: {}", stmt.location);
+    println!("  Resolved Schema: {schema_name}");
+
+    // Register a local object store rooted at the workspace root.
+    // We use a specific authority 'workspace' to ensure consistent resolution.
+    let store = Arc::new(LocalFileSystem::new_with_prefix(workspace_root())?);
+    let store_url = url::Url::parse("local://workspace").unwrap();
+    ctx.register_object_store(&store_url, Arc::clone(&store) as _);
+
+    let target_ext = format!(".{}", stmt.catalog_type.to_lowercase());
+
+    // For 'local://workspace/parquet-testing/data', the path is 'parquet-testing/data'.
+    let path_str = stmt
+        .location
+        .strip_prefix("local://workspace/")
+        .unwrap_or(&stmt.location);
+    let prefix = object_store::path::Path::from(path_str);
+
+    // Discover data files using the ObjectStore API
+    let mut table_count = 0;
+    let mut list_stream = store.list(Some(&prefix));
+
+    while let Some(meta) = list_stream.next().await {
+        let meta = meta?;
+        let path = &meta.location;
+
+        if path.as_ref().ends_with(&target_ext) {
+            let name = std::path::Path::new(path.as_ref())
+                .file_stem()
+                .unwrap()
+                .to_string_lossy()
+                .to_string();
+
+            let table_url = format!("local://workspace/{path}");
+
+            let cmd = CreateExternalTable::builder(
+                TableReference::bare(name.clone()),
+                table_url,
+                stmt.catalog_type.clone(),
+                Arc::new(DFSchema::empty()),
+            )
+            .with_options(table_options.clone())
+            .build();
+
+            match factory.create(&ctx.state(), &cmd).await {
+                Ok(table) => {
+                    schema.register_table(name, table)?;
+                    table_count += 1;
+                }
+                Err(e) => {
+                    eprintln!("Failed to create table {name}: {e}");
+                }
+            }
+        }
+    }
+    println!("  Registered {table_count} tables into schema: {schema_name}");
+
+    catalog.register_schema(&schema_name, schema)?;
+    ctx.register_catalog(stmt.name.to_string(), catalog);
+
+    Ok(())
+}
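+
+// NOTE (editorial): `local://workspace` above is an arbitrary scheme/authority
+// pair chosen for this example; any URL works as an object store key as long
+// as registration and table paths agree (`root` below is a stand-in path):
+//
+//     let store = Arc::new(LocalFileSystem::new_with_prefix(root)?);
+//     let url = url::Url::parse("local://workspace").unwrap();
+//     ctx.register_object_store(&url, store);
+//     // "local://workspace/some/dir/file.parquet" now resolves through it.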
+
+/// Possible statements returned by our custom parser.
+#[derive(Debug, Clone)]
+pub enum CustomStatement {
+    /// Standard DataFusion statement
+    DFStatement(Box<Statement>),
+    /// Custom `CREATE EXTERNAL CATALOG` statement
+    CreateExternalCatalog(CreateExternalCatalog),
+}
+
+/// Data structure for `CREATE EXTERNAL CATALOG`.
+#[derive(Debug, Clone)]
+pub struct CreateExternalCatalog {
+    pub name: ObjectName,
+    pub catalog_type: String,
+    pub location: String,
+    pub options: Vec<(String, Value)>,
+}
+
+impl Display for CustomStatement {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::DFStatement(s) => write!(f, "{s}"),
+            Self::CreateExternalCatalog(s) => write!(f, "{s}"),
+        }
+    }
+}
+
+impl Display for CreateExternalCatalog {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "CREATE EXTERNAL CATALOG {} STORED AS {} LOCATION '{}'",
+            self.name, self.catalog_type, self.location
+        )?;
+        if !self.options.is_empty() {
+            write!(f, " OPTIONS (")?;
+            for (i, (k, v)) in self.options.iter().enumerate() {
+                if i > 0 {
+                    write!(f, ", ")?;
+                }
+                write!(f, "'{k}' = '{v}'")?;
+            }
+            write!(f, ")")?;
+        }
+        Ok(())
+    }
+}
+
+/// A parser that extends `DFParser` with custom syntax.
+struct CustomParser<'a> {
+    df_parser: DFParser<'a>,
+}
+
+impl<'a> CustomParser<'a> {
+    fn new(sql: &'a str) -> Result<Self> {
+        Ok(Self {
+            df_parser: DFParserBuilder::new(sql).build()?,
+        })
+    }
+
+    pub fn parse_statement(&mut self) -> Result<CustomStatement> {
+        if self.is_create_external_catalog() {
+            return self.parse_create_external_catalog();
+        }
+        Ok(CustomStatement::DFStatement(Box::new(
+            self.df_parser.parse_statement()?,
+        )))
+    }
+
+    fn is_create_external_catalog(&self) -> bool {
+        let t1 = &self.df_parser.parser.peek_nth_token(0).token;
+        let t2 = &self.df_parser.parser.peek_nth_token(1).token;
+        let t3 = &self.df_parser.parser.peek_nth_token(2).token;
+
+        matches!(t1, Token::Word(w) if w.keyword == Keyword::CREATE)
+            && matches!(t2, Token::Word(w) if w.keyword == Keyword::EXTERNAL)
+            && matches!(t3, Token::Word(w) if w.value.to_uppercase() == "CATALOG")
+    }
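+
+    // NOTE (editorial): `peek_nth_token` is non-consuming, so the check above
+    // inspects three tokens of lookahead while leaving the token stream
+    // untouched if the statement turns out to be ordinary SQL; only once the
+    // prefix matches does `parse_create_external_catalog` (below) call
+    // `next_token` to actually consume `CREATE EXTERNAL CATALOG`.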
+
+    fn parse_create_external_catalog(&mut self) -> Result<CustomStatement> {
+        // Consume prefix tokens: CREATE EXTERNAL CATALOG
+        for _ in 0..3 {
+            self.df_parser.parser.next_token();
+        }
+
+        let name = self
+            .df_parser
+            .parser
+            .parse_object_name(false)
+            .map_err(|e| DataFusionError::External(Box::new(e)))?;
+
+        let mut catalog_type = None;
+        let mut location = None;
+        let mut options = vec![];
+
+        while let Some(keyword) = self.df_parser.parser.parse_one_of_keywords(&[
+            Keyword::STORED,
+            Keyword::LOCATION,
+            Keyword::OPTIONS,
+        ]) {
+            match keyword {
+                Keyword::STORED => {
+                    if catalog_type.is_some() {
+                        return plan_err!("Duplicate STORED AS");
+                    }
+                    self.df_parser
+                        .parser
+                        .expect_keyword(Keyword::AS)
+                        .map_err(|e| DataFusionError::External(Box::new(e)))?;
+                    catalog_type = Some(
+                        self.df_parser
+                            .parser
+                            .parse_identifier()
+                            .map_err(|e| DataFusionError::External(Box::new(e)))?
+                            .value,
+                    );
+                }
+                Keyword::LOCATION => {
+                    if location.is_some() {
+                        return plan_err!("Duplicate LOCATION");
+                    }
+                    location = Some(
+                        self.df_parser
+                            .parser
+                            .parse_literal_string()
+                            .map_err(|e| DataFusionError::External(Box::new(e)))?,
+                    );
+                }
+                Keyword::OPTIONS => {
+                    if !options.is_empty() {
+                        return plan_err!("Duplicate OPTIONS");
+                    }
+                    options = self.parse_value_options()?;
+                }
+                _ => unreachable!(),
+            }
+        }
+
+        Ok(CustomStatement::CreateExternalCatalog(
+            CreateExternalCatalog {
+                name,
+                catalog_type: catalog_type
+                    .ok_or_else(|| plan_datafusion_err!("Missing STORED AS"))?,
+                location: location
+                    .ok_or_else(|| plan_datafusion_err!("Missing LOCATION"))?,
+                options,
+            },
+        ))
+    }
+
+    /// Parse options in the form: (key [=] value, key [=] value, ...)
+    fn parse_value_options(&mut self) -> Result<Vec<(String, Value)>> {
+        let mut options = vec![];
+        self.df_parser
+            .parser
+            .expect_token(&Token::LParen)
+            .map_err(|e| DataFusionError::External(Box::new(e)))?;
+
+        loop {
+            let key = self.df_parser.parse_option_key()?;
+            // Support optional '=' between key and value
+            let _ = self.df_parser.parser.consume_token(&Token::Eq);
+            let value = self.df_parser.parse_option_value()?;
+            options.push((key, value));
+
+            let comma = self.df_parser.parser.consume_token(&Token::Comma);
+            if self.df_parser.parser.consume_token(&Token::RParen) {
+                break;
+            } else if !comma {
+                return plan_err!("Expected ',' or ')' in OPTIONS");
+            }
+        }
+        Ok(options)
+    }
+}
+
+/// Returns the workspace root directory (parent of datafusion-examples).
+fn workspace_root() -> std::path::PathBuf {
+    std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+        .parent()
+        .expect("CARGO_MANIFEST_DIR should have a parent")
+        .to_path_buf()
+}
diff --git a/datafusion-examples/examples/sql_frontend.rs b/datafusion-examples/examples/sql_ops/frontend.rs
similarity index 94%
rename from datafusion-examples/examples/sql_frontend.rs
rename to datafusion-examples/examples/sql_ops/frontend.rs
index 1fc9ce24ecbb5..b34c720a78198 100644
--- a/datafusion-examples/examples/sql_frontend.rs
+++ b/datafusion-examples/examples/sql_ops/frontend.rs
@@ -15,13 +15,15 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
-use datafusion::common::{plan_err, TableReference};
+use datafusion::common::{TableReference, plan_err};
 use datafusion::config::ConfigOptions;
 use datafusion::error::Result;
 use datafusion::logical_expr::{
-    AggregateUDF, Expr, LogicalPlan, ScalarUDF, TableProviderFilterPushDown, TableSource,
-    WindowUDF,
+    AggregateUDF, Expr, HigherOrderUDF, LogicalPlan, ScalarUDF,
+    TableProviderFilterPushDown, TableSource, WindowUDF,
 };
 use datafusion::optimizer::{
     Analyzer, AnalyzerRule, Optimizer, OptimizerConfig, OptimizerContext, OptimizerRule,
@@ -29,7 +31,6 @@
 use datafusion::sql::planner::{ContextProvider, SqlToRel};
 use datafusion::sql::sqlparser::dialect::PostgreSqlDialect;
 use datafusion::sql::sqlparser::parser::Parser;
-use std::any::Any;
 use std::sync::Arc;
 
 /// This example shows how to use DataFusion's SQL planner to parse SQL text and
@@ -44,7 +45,7 @@
 ///
 /// In this example, we demonstrate how to use the lower level APIs directly,
 /// which only requires the `datafusion-sql` dependency.
-pub fn main() -> Result<()> {
+pub fn frontend() -> Result<()> {
     // First, we parse the SQL string. Note that we use the DataFusion
     // Parser, which wraps the `sqlparser-rs` SQL parser and adds DataFusion
     // specific syntax such as `CREATE EXTERNAL TABLE`
@@ -153,6 +154,10 @@ impl ContextProvider for MyContextProvider {
         None
     }
 
+    fn get_higher_order_meta(&self, _name: &str) -> Option<Arc<HigherOrderUDF>> {
+        None
+    }
+
     fn get_aggregate_meta(&self, _name: &str) -> Option<Arc<AggregateUDF>> {
         None
     }
@@ -173,6 +178,10 @@
         Vec::new()
     }
 
+    fn higher_order_function_names(&self) -> Vec<String> {
+        Vec::new()
+    }
+
     fn udaf_names(&self) -> Vec<String> {
         Vec::new()
     }
@@ -188,10 +197,6 @@ struct MyTableSource {
 }
 
 impl TableSource for MyTableSource {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
     fn schema(&self) -> SchemaRef {
         self.schema.clone()
     }
diff --git a/datafusion-examples/examples/sql_ops/main.rs b/datafusion-examples/examples/sql_ops/main.rs
new file mode 100644
index 0000000000000..ce7be8fa2bada
--- /dev/null
+++ b/datafusion-examples/examples/sql_ops/main.rs
@@ -0,0 +1,102 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! # SQL Examples
+//!
+//! These examples demonstrate SQL operations in DataFusion.
+//!
+//! ## Usage
+//! ```bash
+//! cargo run --example sql_ops -- [all|analysis|custom_sql_parser|frontend|query]
+//! ```
+//!
+//! Each subcommand runs a corresponding example:
+//! - `all` — run all examples included in this module
+//!
+//! - `analysis`
+//!   (file: analysis.rs, desc: Analyze SQL queries)
+//!
+//! - `custom_sql_parser`
+//!   (file: custom_sql_parser.rs, desc: Implement a custom SQL parser to extend DataFusion)
+//!
+//! - `frontend`
+//!   (file: frontend.rs, desc: Build LogicalPlans from SQL)
+//!
+//! - `query`
+//!   (file: query.rs, desc: Query data using SQL)
+
+mod analysis;
+mod custom_sql_parser;
+mod frontend;
+mod query;
+
+use datafusion::error::{DataFusionError, Result};
+use strum::{IntoEnumIterator, VariantNames};
+use strum_macros::{Display, EnumIter, EnumString, VariantNames};
+
+#[derive(EnumIter, EnumString, Display, VariantNames)]
+#[strum(serialize_all = "snake_case")]
+enum ExampleKind {
+    All,
+    Analysis,
+    CustomSqlParser,
+    Frontend,
+    Query,
+}
+
+impl ExampleKind {
+    const EXAMPLE_NAME: &str = "sql_ops";
+
+    fn runnable() -> impl Iterator<Item = Self> {
+        ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All))
+    }
+
+    async fn run(&self) -> Result<()> {
+        match self {
+            ExampleKind::All => {
+                for example in ExampleKind::runnable() {
+                    println!("Running example: {example}");
+                    Box::pin(example.run()).await?;
+                }
+            }
+            ExampleKind::Analysis => analysis::analysis().await?,
+            ExampleKind::CustomSqlParser => {
+                custom_sql_parser::custom_sql_parser().await?
+ } + ExampleKind::Frontend => frontend::frontend()?, + ExampleKind::Query => query::query().await?, + } + Ok(()) + } +} + +#[tokio::main] +async fn main() -> Result<()> { + let usage = format!( + "Usage: cargo run --example {} -- [{}]", + ExampleKind::EXAMPLE_NAME, + ExampleKind::VARIANTS.join("|") + ); + + let example: ExampleKind = std::env::args() + .nth(1) + .unwrap_or_else(|| ExampleKind::All.to_string()) + .parse() + .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?; + + example.run().await +} diff --git a/datafusion-examples/examples/sql_query.rs b/datafusion-examples/examples/sql_ops/query.rs similarity index 66% rename from datafusion-examples/examples/sql_query.rs rename to datafusion-examples/examples/sql_ops/query.rs index 0ac203cfb7e74..60b47c36b9ae2 100644 --- a/datafusion-examples/examples/sql_query.rs +++ b/datafusion-examples/examples/sql_ops/query.rs @@ -15,26 +15,27 @@ // specific language governing permissions and limitations // under the License. -use datafusion::arrow::array::{UInt64Array, UInt8Array}; +//! See `main.rs` for how to run it. + +use std::sync::Arc; + +use datafusion::arrow::array::{UInt8Array, UInt64Array}; use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion::arrow::record_batch::RecordBatch; +use datafusion::catalog::MemTable; use datafusion::common::{assert_batches_eq, exec_datafusion_err}; use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::ListingOptions; -use datafusion::datasource::MemTable; use datafusion::error::{DataFusionError, Result}; use datafusion::prelude::*; +use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet}; use object_store::local::LocalFileSystem; -use std::path::Path; -use std::sync::Arc; /// Examples of various ways to execute queries using SQL /// /// [`query_memtable`]: a simple query against a [`MemTable`] /// [`query_parquet`]: a simple query against a directory with multiple Parquet files -/// -#[tokio::main] -async fn main() -> Result<()> { +pub async fn query() -> Result<()> { query_memtable().await?; query_parquet().await?; Ok(()) @@ -113,32 +114,33 @@ async fn query_parquet() -> Result<()> { // create local execution context let ctx = SessionContext::new(); - let test_data = datafusion::test_util::parquet_test_data(); + // Convert the CSV input into a temporary Parquet directory for querying + let dataset = ExampleDataset::Cars; + let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?; // Configure listing options let file_format = ParquetFormat::default().with_enable_pruning(true); - let listing_options = ListingOptions::new(Arc::new(file_format)) - // This is a workaround for this example since `test_data` contains - // many different parquet different files, - // in practice use FileType::PARQUET.get_ext(). - .with_file_extension("alltypes_plain.parquet"); + let listing_options = + ListingOptions::new(Arc::new(file_format)).with_file_extension(".parquet"); + + let table_path = parquet_temp.file_uri()?; // First example were we use an absolute path, which requires no additional setup. 
ctx.register_listing_table( "my_table", - &format!("file://{test_data}/"), + &table_path, listing_options.clone(), None, None, ) - .await - .unwrap(); + .await?; // execute the query let df = ctx .sql( "SELECT * \ FROM my_table \ + ORDER BY speed \ LIMIT 1", ) .await?; @@ -147,20 +149,22 @@ async fn query_parquet() -> Result<()> { let results = df.collect().await?; assert_batches_eq!( [ - "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+", - "| id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col | string_col | timestamp_col |", - "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+", - "| 4 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30332f30312f3039 | 30 | 2009-03-01T00:00:00 |", - "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+", + "+-----+-------+---------------------+", + "| car | speed | time |", + "+-----+-------+---------------------+", + "| red | 0.0 | 1996-04-12T12:05:15 |", + "+-----+-------+---------------------+", ], - &results); + &results + ); - // Second example were we temporarily move into the test data's parent directory and - // simulate a relative path, this requires registering an ObjectStore. + // Second example where we change the current working directory and explicitly + // register a local filesystem object store. This demonstrates how listing tables + // resolve paths via an ObjectStore, even when using filesystem-backed data. let cur_dir = std::env::current_dir()?; - - let test_data_path = Path::new(&test_data); - let test_data_path_parent = test_data_path + let test_data_path_parent = parquet_temp + .tmp_dir + .path() .parent() .ok_or(exec_datafusion_err!("test_data path needs a parent"))?; @@ -168,15 +172,15 @@ async fn query_parquet() -> Result<()> { let local_fs = Arc::new(LocalFileSystem::default()); - let u = url::Url::parse("file://./") + let url = url::Url::parse("file://./") .map_err(|e| DataFusionError::External(Box::new(e)))?; - ctx.register_object_store(&u, local_fs); + ctx.register_object_store(&url, local_fs); // Register a listing table - this will use all files in the directory as data sources // for the query ctx.register_listing_table( "relative_table", - "./data", + parquet_temp.path_str()?, listing_options.clone(), None, None, @@ -188,6 +192,7 @@ async fn query_parquet() -> Result<()> { .sql( "SELECT * \ FROM relative_table \ + ORDER BY speed \ LIMIT 1", ) .await?; @@ -196,13 +201,14 @@ async fn query_parquet() -> Result<()> { let results = df.collect().await?; assert_batches_eq!( [ - "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+", - "| id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col | string_col | timestamp_col |", - "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+", - "| 4 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30332f30312f3039 | 30 | 2009-03-01T00:00:00 |", - "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+", + 
"+-----+-------+---------------------+", + "| car | speed | time |", + "+-----+-------+---------------------+", + "| red | 0.0 | 1996-04-12T12:05:15 |", + "+-----+-------+---------------------+", ], - &results); + &results + ); // Reset the current directory std::env::set_current_dir(cur_dir)?; diff --git a/datafusion-examples/examples/advanced_udaf.rs b/datafusion-examples/examples/udf/advanced_udaf.rs similarity index 96% rename from datafusion-examples/examples/advanced_udaf.rs rename to datafusion-examples/examples/udf/advanced_udaf.rs index 89f0a470e32e4..f1651dbf28913 100644 --- a/datafusion-examples/examples/advanced_udaf.rs +++ b/datafusion-examples/examples/udf/advanced_udaf.rs @@ -15,10 +15,12 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. + use arrow::datatypes::{Field, Schema}; use datafusion::physical_expr::NullState; use datafusion::{arrow::datatypes::DataType, logical_expr::Volatility}; -use std::{any::Any, sync::Arc}; +use std::sync::Arc; use arrow::array::{ ArrayRef, AsArray, Float32Array, PrimitiveArray, PrimitiveBuilder, UInt32Array, @@ -26,13 +28,13 @@ use arrow::array::{ use arrow::datatypes::{ArrowNativeTypeOp, ArrowPrimitiveType, Float64Type, UInt32Type}; use arrow::record_batch::RecordBatch; use arrow_schema::FieldRef; -use datafusion::common::{cast::as_float64_array, ScalarValue}; +use datafusion::common::{ScalarValue, cast::as_float64_array}; use datafusion::error::Result; use datafusion::logical_expr::{ + Accumulator, AggregateUDF, AggregateUDFImpl, EmitTo, GroupsAccumulator, Signature, expr::AggregateFunction, function::{AccumulatorArgs, AggregateFunctionSimplification, StateFieldsArgs}, - simplify::SimplifyInfo, - Accumulator, AggregateUDF, AggregateUDFImpl, EmitTo, GroupsAccumulator, Signature, + simplify::SimplifyContext, }; use datafusion::prelude::*; @@ -62,11 +64,6 @@ impl GeoMeanUdaf { } impl AggregateUDFImpl for GeoMeanUdaf { - /// We implement as_any so that we can downcast the AggregateUDFImpl trait object - fn as_any(&self) -> &dyn Any { - self - } - /// Return the name of this function fn name(&self) -> &str { "geo_mean" @@ -312,12 +309,16 @@ impl GroupsAccumulator for GeometricMeanGroupsAccumulator { let prods = emit_to.take_needed(&mut self.prods); let nulls = self.null_state.build(emit_to); - assert_eq!(nulls.len(), prods.len()); + if let Some(nulls) = &nulls { + assert_eq!(nulls.len(), counts.len()); + } assert_eq!(counts.len(), prods.len()); // don't evaluate geometric mean with null inputs to avoid errors on null values - let array: PrimitiveArray = if nulls.null_count() > 0 { + let array: PrimitiveArray = if let Some(nulls) = &nulls + && nulls.null_count() > 0 + { let mut builder = PrimitiveBuilder::::with_capacity(nulls.len()); let iter = prods.into_iter().zip(counts).zip(nulls.iter()); @@ -335,7 +336,7 @@ impl GroupsAccumulator for GeometricMeanGroupsAccumulator { .zip(counts) .map(|(prod, count)| prod.powf(1.0 / count as f64)) .collect::>(); - PrimitiveArray::new(geo_mean.into(), Some(nulls)) // no copy + PrimitiveArray::new(geo_mean.into(), nulls) // no copy .with_data_type(self.return_data_type.clone()) }; @@ -345,7 +346,6 @@ impl GroupsAccumulator for GeometricMeanGroupsAccumulator { // return arrays for counts and prods fn state(&mut self, emit_to: EmitTo) -> Result> { let nulls = self.null_state.build(emit_to); - let nulls = Some(nulls); let counts = emit_to.take_needed(&mut self.counts); let counts = UInt32Array::new(counts.into(), nulls.clone()); // zero copy 
@@ -382,10 +382,6 @@ impl SimplifiedGeoMeanUdaf { } impl AggregateUDFImpl for SimplifiedGeoMeanUdaf { - fn as_any(&self) -> &dyn Any { - self - } - fn name(&self) -> &str { "simplified_geo_mean" } @@ -419,7 +415,7 @@ impl AggregateUDFImpl for SimplifiedGeoMeanUdaf { /// Optionally replaces a UDAF with another expression during query optimization. fn simplify(&self) -> Option<AggregateFunctionSimplification> { - let simplify = |aggregate_function: AggregateFunction, _: &dyn SimplifyInfo| { + let simplify = |aggregate_function: AggregateFunction, _: &SimplifyContext| { // Replaces the UDAF with `GeoMeanUdaf` as a placeholder example to demonstrate the `simplify` method. // In real-world scenarios, you might create UDFs from built-in expressions. Ok(Expr::AggregateFunction(AggregateFunction::new_udf( @@ -469,8 +465,9 @@ fn create_context() -> Result<SessionContext> { Ok(ctx) } -#[tokio::main] -async fn main() -> Result<()> { +/// In this example we register `GeoMeanUdaf` and `SimplifiedGeoMeanUdaf` +/// as user defined aggregate functions and invoke them via the DataFrame API and SQL +pub async fn advanced_udaf() -> Result<()> { let ctx = create_context()?; let geo_mean_udf = AggregateUDF::from(GeoMeanUdaf::new());
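Although `advanced_udaf` now exposes `pub async fn advanced_udaf()` instead of its own `main`, the flow it demonstrates is unchanged: register the UDAF, then call it from SQL or the DataFrame API. A hedged, self-contained sketch of that flow, using the built-in `avg` as a stand-in for the example's `geo_mean`:

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, Float64Array};
use arrow::record_batch::RecordBatch;
use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    // In-memory table, mirroring the example's create_context().
    let a: ArrayRef = Arc::new(Float64Array::from(vec![2.0, 4.0, 8.0]));
    let batch = RecordBatch::try_from_iter(vec![("a", a)])?;

    let ctx = SessionContext::new();
    ctx.register_batch("t", batch)?;

    // After `ctx.register_udaf(...)`, a custom aggregate such as
    // `geo_mean(a)` is called exactly like the built-in used here.
    ctx.sql("SELECT avg(a) AS mean FROM t").await?.show().await?;
    Ok(())
}
```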
diff --git a/datafusion-examples/examples/advanced_udf.rs b/datafusion-examples/examples/udf/advanced_udf.rs similarity index 98% rename from datafusion-examples/examples/advanced_udf.rs rename to datafusion-examples/examples/udf/advanced_udf.rs index 56ae599efa11b..d3815459dba52 100644 --- a/datafusion-examples/examples/advanced_udf.rs +++ b/datafusion-examples/examples/udf/advanced_udf.rs @@ -15,19 +15,20 @@ // specific language governing permissions and limitations // under the License. -use std::any::Any; +//! See `main.rs` for how to run it. + use std::sync::Arc; use arrow::array::{ - new_null_array, Array, ArrayRef, AsArray, Float32Array, Float64Array, + Array, ArrayRef, AsArray, Float32Array, Float64Array, new_null_array, }; use arrow::compute; use arrow::datatypes::{DataType, Float64Type}; use arrow::record_batch::RecordBatch; -use datafusion::common::{exec_err, internal_err, ScalarValue}; +use datafusion::common::{ScalarValue, exec_err, internal_err}; use datafusion::error::Result; -use datafusion::logical_expr::sort_properties::{ExprProperties, SortProperties}; use datafusion::logical_expr::Volatility; +use datafusion::logical_expr::sort_properties::{ExprProperties, SortProperties}; use datafusion::logical_expr::{ ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, }; @@ -64,10 +65,6 @@ impl PowUdf { } impl ScalarUDFImpl for PowUdf { /// We implement as_any so that we can downcast the ScalarUDFImpl trait object - fn as_any(&self) -> &dyn Any { - self - } - /// Return the name of this function fn name(&self) -> &str { "pow" @@ -245,10 +242,35 @@ fn maybe_pow_in_place(base: f64, exp_array: ArrayRef) -> Result<ColumnarValue> { } } +/// create local execution context with an in-memory table: +/// +/// ```text +/// +-----+-----+ +/// | a | b | +/// +-----+-----+ +/// | 2.1 | 1.0 | +/// | 3.1 | 2.0 | +/// | 4.1 | 3.0 | +/// | 5.1 | 4.0 | +/// +-----+-----+ +/// ``` +fn create_context() -> Result<SessionContext> { + // define data. + let a: ArrayRef = Arc::new(Float32Array::from(vec![2.1, 3.1, 4.1, 5.1])); + let b: ArrayRef = Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])); + let batch = RecordBatch::try_from_iter(vec![("a", a), ("b", b)])?; + + // declare a new context. In Spark API, this corresponds to a new SparkSession + let ctx = SessionContext::new(); + + // declare a table in memory. In Spark API, this corresponds to createDataFrame(...). + ctx.register_batch("t", batch)?; + Ok(ctx) +} + /// In this example we register `PowUdf` as a user defined function /// and invoke it via the DataFrame API and SQL -#[tokio::main] -async fn main() -> Result<()> { +pub async fn advanced_udf() -> Result<()> { let ctx = create_context()?; // create the UDF @@ -295,29 +317,3 @@ async fn main() -> Result<()> { Ok(()) } - -/// create local execution context with an in-memory table: -/// -/// ```text -/// +-----+-----+ -/// | a | b | -/// +-----+-----+ -/// | 2.1 | 1.0 | -/// | 3.1 | 2.0 | -/// | 4.1 | 3.0 | -/// | 5.1 | 4.0 | -/// +-----+-----+ -/// ``` -fn create_context() -> Result<SessionContext> { - // define data. - let a: ArrayRef = Arc::new(Float32Array::from(vec![2.1, 3.1, 4.1, 5.1])); - let b: ArrayRef = Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])); - let batch = RecordBatch::try_from_iter(vec![("a", a), ("b", b)])?; - - // declare a new context. In Spark API, this corresponds to a new SparkSession - let ctx = SessionContext::new(); - - // declare a table in memory. In Spark API, this corresponds to createDataFrame(...). - ctx.register_batch("t", batch)?; - Ok(ctx) -}
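The relocated `create_context` above builds the two-column table that `pow` runs against. As a hedged sketch (plain arithmetic stands in for the registered `pow` UDF, which would be invoked similarly, e.g. via `pow.call(vec![...])`), querying that table through the DataFrame API looks like:

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, Float32Array, Float64Array};
use arrow::record_batch::RecordBatch;
use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    // Same data shape as the example's create_context().
    let a: ArrayRef = Arc::new(Float32Array::from(vec![2.1f32, 3.1, 4.1, 5.1]));
    let b: ArrayRef = Arc::new(Float64Array::from(vec![1.0f64, 2.0, 3.0, 4.0]));
    let batch = RecordBatch::try_from_iter(vec![("a", a), ("b", b)])?;

    let ctx = SessionContext::new();
    ctx.register_batch("t", batch)?;

    // A built-in arithmetic expression stands in for the custom UDF here.
    ctx.table("t")
        .await?
        .select(vec![(col("a") + col("b")).alias("a_plus_b")])?
        .show()
        .await?;
    Ok(())
}
```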
diff --git a/datafusion-examples/examples/advanced_udwf.rs b/datafusion-examples/examples/udf/advanced_udwf.rs similarity index 93% rename from datafusion-examples/examples/advanced_udwf.rs rename to datafusion-examples/examples/udf/advanced_udwf.rs index ba4c377fd6762..2508e6cd60e59 100644 --- a/datafusion-examples/examples/advanced_udwf.rs +++ b/datafusion-examples/examples/udf/advanced_udwf.rs @@ -15,6 +15,10 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. + +use std::sync::Arc; + use arrow::datatypes::Field; use arrow::{ array::{ArrayRef, AsArray, Float64Array}, @@ -28,7 +32,7 @@ use datafusion::logical_expr::expr::{WindowFunction, WindowFunctionParams}; use datafusion::logical_expr::function::{ PartitionEvaluatorArgs, WindowFunctionSimplification, WindowUDFFieldArgs, }; -use datafusion::logical_expr::simplify::SimplifyInfo; +use datafusion::logical_expr::simplify::SimplifyContext; use datafusion::logical_expr::{ Expr, LimitEffect, PartitionEvaluator, Signature, WindowFrame, WindowFunctionDefinition, WindowUDF, WindowUDFImpl, @@ -36,8 +40,7 @@ use datafusion::physical_expr::PhysicalExpr; use datafusion::prelude::*; use datafusion::{arrow::datatypes::DataType, logical_expr::Volatility}; -use std::any::Any; -use std::sync::Arc; +use datafusion_examples::utils::datasets::ExampleDataset; /// This example shows how to use the full WindowUDFImpl API to implement a user /// defined window function. As in the `simple_udwf.rs` example, this struct implements @@ -65,11 +68,6 @@ impl SmoothItUdf { } impl WindowUDFImpl for SmoothItUdf { - /// We implement as_any so that we can downcast the WindowUDFImpl trait object - fn as_any(&self) -> &dyn Any { - self - } - /// Return the name of this function fn name(&self) -> &str { "smooth_it" @@ -173,10 +171,6 @@ impl SimplifySmoothItUdf { } } impl WindowUDFImpl for SimplifySmoothItUdf { - fn as_any(&self) -> &dyn Any { - self - } - fn name(&self) -> &str { "simplify_smooth_it" } @@ -195,7 +189,7 @@ impl WindowUDFImpl for SimplifySmoothItUdf { /// this function will simplify `SimplifySmoothItUdf` to `AggregateUDF` for `Avg` /// default implementation will not be called (left as `todo!()`) fn simplify(&self) -> Option<WindowFunctionSimplification> { - let simplify = |window_function: WindowFunction, _: &dyn SimplifyInfo| { + let simplify = |window_function: WindowFunction, _: &SimplifyContext| { Ok(Expr::from(WindowFunction { fun: WindowFunctionDefinition::AggregateUDF(avg_udaf()), params: WindowFunctionParams { @@ -227,17 +221,17 @@ async fn create_context() -> Result<SessionContext> { // declare a new context. In spark API, this corresponds to a new spark SQL session let ctx = SessionContext::new(); - // declare a table in memory. In spark API, this corresponds to createDataFrame(...). - println!("pwd: {}", std::env::current_dir().unwrap().display()); - let csv_path = "../../datafusion/core/tests/data/cars.csv".to_string(); - let read_options = CsvReadOptions::default().has_header(true); + let dataset = ExampleDataset::Cars; + + ctx.register_csv("cars", dataset.path_str()?, CsvReadOptions::new()) + .await?; - ctx.register_csv("cars", &csv_path, read_options).await?; Ok(ctx) } -#[tokio::main] -async fn main() -> Result<()> { +/// In this example we register `SmoothItUdf` as a user defined window function +/// and invoke it via the DataFrame API and SQL +pub async fn advanced_udwf() -> Result<()> { let ctx = create_context().await?; let smooth_it = WindowUDF::from(SmoothItUdf::new()); ctx.register_udwf(smooth_it.clone()); diff --git a/datafusion-examples/examples/async_udf.rs b/datafusion-examples/examples/udf/async_udf.rs similarity index 81% rename from datafusion-examples/examples/async_udf.rs rename to datafusion-examples/examples/udf/async_udf.rs index b52ec68ea4422..43b82c398c5c6 100644 --- a/datafusion-examples/examples/async_udf.rs +++ b/datafusion-examples/examples/udf/async_udf.rs @@ -15,12 +15,16 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. +//! //! This example shows how to create and use "Async UDFs" in DataFusion. //! //! Async UDFs allow you to perform asynchronous operations, such as //! making network requests. This can be used for tasks like fetching //! data from an external API such as a LLM service or an external database.
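Looking back at the `advanced_udwf` hunk above: once `SmoothItUdf` is registered via `ctx.register_udwf`, it is invoked with ordinary `OVER` syntax. A hedged, self-contained sketch of that call pattern (the built-in `avg` stands in for `smooth_it`):

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, Float64Array, StringArray};
use arrow::record_batch::RecordBatch;
use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    // Tiny in-memory stand-in for the cars dataset.
    let car: ArrayRef = Arc::new(StringArray::from(vec!["red", "red", "green"]));
    let speed: ArrayRef = Arc::new(Float64Array::from(vec![20.0, 20.3, 10.0]));
    let batch = RecordBatch::try_from_iter(vec![("car", car), ("speed", speed)])?;

    let ctx = SessionContext::new();
    ctx.register_batch("cars", batch)?;

    // With `smooth_it` registered, replace `avg` below with `smooth_it`.
    ctx.sql("SELECT car, speed, avg(speed) OVER (PARTITION BY car) AS smooth FROM cars")
        .await?
        .show()
        .await?;
    Ok(())
}
```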
+use std::sync::Arc; + use arrow::array::{ArrayRef, BooleanArray, Int64Array, RecordBatch, StringArray}; use arrow_schema::{DataType, Field, Schema}; use async_trait::async_trait; @@ -35,11 +39,10 @@ use datafusion::logical_expr::{ ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, }; use datafusion::prelude::{SessionConfig, SessionContext}; -use std::any::Any; -use std::sync::Arc; -#[tokio::main] -async fn main() -> Result<()> { +/// In this example we register `AskLLM` as an asynchronous user defined function +/// and invoke it via the DataFrame API and SQL +pub async fn async_udf() -> Result<()> { // Use a hard coded parallelism level of 4 so the explain plan // is consistent across machines. let config = SessionConfig::new().with_target_partitions(4); @@ -90,20 +93,18 @@ async fn main() -> Result<()> { assert_batches_eq!( [ - "+---------------+--------------------------------------------------------------------------------------------------------------------------------+", - "| plan_type | plan |", - "+---------------+--------------------------------------------------------------------------------------------------------------------------------+", - "| logical_plan | SubqueryAlias: a |", - "| | Filter: ask_llm(CAST(animal.name AS Utf8View), Utf8View(\"Is this animal furry?\")) |", - "| | TableScan: animal projection=[id, name] |", - "| physical_plan | CoalesceBatchesExec: target_batch_size=8192 |", - "| | FilterExec: __async_fn_0@2, projection=[id@0, name@1] |", - "| | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 |", - "| | AsyncFuncExec: async_expr=[async_expr(name=__async_fn_0, expr=ask_llm(CAST(name@1 AS Utf8View), Is this animal furry?))] |", - "| | CoalesceBatchesExec: target_batch_size=8192 |", - "| | DataSourceExec: partitions=1, partition_sizes=[1] |", - "| | |", - "+---------------+--------------------------------------------------------------------------------------------------------------------------------+", + "+---------------+------------------------------------------------------------------------------------------------------------------------------+", + "| plan_type | plan |", + "+---------------+------------------------------------------------------------------------------------------------------------------------------+", + "| logical_plan | SubqueryAlias: a |", + "| | Filter: ask_llm(CAST(animal.name AS Utf8View), Utf8View(\"Is this animal furry?\")) |", + "| | TableScan: animal projection=[id, name] |", + "| physical_plan | FilterExec: __async_fn_0@2, projection=[id@0, name@1] |", + "| | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 |", + "| | AsyncFuncExec: async_expr=[async_expr(name=__async_fn_0, expr=ask_llm(CAST(name@1 AS Utf8View), Is this animal furry?))] |", + "| | DataSourceExec: partitions=1, partition_sizes=[1] |", + "| | |", + "+---------------+------------------------------------------------------------------------------------------------------------------------------+", ], &results ); @@ -159,10 +160,6 @@ impl AskLLM { /// information for the function, such as its name, signature, and return type. 
/// [async_trait] impl ScalarUDFImpl for AskLLM { - fn as_any(&self) -> &dyn Any { - self - } - fn name(&self) -> &str { "ask_llm" } diff --git a/datafusion-examples/examples/udf/main.rs b/datafusion-examples/examples/udf/main.rs new file mode 100644 index 0000000000000..89f3fd801deec --- /dev/null +++ b/datafusion-examples/examples/udf/main.rs @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! # User-Defined Functions Examples +//! +//! These examples demonstrate user-defined functions in DataFusion. +//! +//! ## Usage +//! ```bash +//! cargo run --example udf -- [all|adv_udaf|adv_udf|adv_udwf|async_udf|udaf|udf|udtf|udwf|table_list_udtf] +//! ``` +//! +//! Each subcommand runs a corresponding example: +//! - `all` — run all examples included in this module +//! +//! - `adv_udaf` +//! (file: advanced_udaf.rs, desc: Advanced User Defined Aggregate Function (UDAF)) +//! +//! - `adv_udf` +//! (file: advanced_udf.rs, desc: Advanced User Defined Scalar Function (UDF)) +//! +//! - `adv_udwf` +//! (file: advanced_udwf.rs, desc: Advanced User Defined Window Function (UDWF)) +//! +//! - `async_udf` +//! (file: async_udf.rs, desc: Asynchronous User Defined Scalar Function) +//! +//! - `udaf` +//! (file: simple_udaf.rs, desc: Simple UDAF example) +//! +//! - `udf` +//! (file: simple_udf.rs, desc: Simple UDF example) +//! +//! - `udtf` +//! (file: simple_udtf.rs, desc: Simple UDTF example) +//! +//! - `udwf` +//! (file: simple_udwf.rs, desc: Simple UDWF example) +//! +//! - `table_list_udtf` +//! 
(file: table_list_udtf.rs, desc: Session-aware UDTF table list example) + +mod advanced_udaf; +mod advanced_udf; +mod advanced_udwf; +mod async_udf; +mod simple_udaf; +mod simple_udf; +mod simple_udtf; +mod simple_udwf; +mod table_list_udtf; + +use datafusion::error::{DataFusionError, Result}; +use strum::{IntoEnumIterator, VariantNames}; +use strum_macros::{Display, EnumIter, EnumString, VariantNames}; + +#[derive(EnumIter, EnumString, Display, VariantNames)] +#[strum(serialize_all = "snake_case")] +enum ExampleKind { + All, + AdvUdaf, + AdvUdf, + AdvUdwf, + AsyncUdf, + Udf, + Udaf, + Udwf, + Udtf, + TableListUdtf, +} + +impl ExampleKind { + const EXAMPLE_NAME: &str = "udf"; + + fn runnable() -> impl Iterator<Item = ExampleKind> { + ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All)) + } + + async fn run(&self) -> Result<()> { + match self { + ExampleKind::All => { + for example in ExampleKind::runnable() { + println!("Running example: {example}"); + Box::pin(example.run()).await?; + } + } + ExampleKind::AdvUdaf => advanced_udaf::advanced_udaf().await?, + ExampleKind::AdvUdf => advanced_udf::advanced_udf().await?, + ExampleKind::AdvUdwf => advanced_udwf::advanced_udwf().await?, + ExampleKind::AsyncUdf => async_udf::async_udf().await?, + ExampleKind::Udaf => simple_udaf::simple_udaf().await?, + ExampleKind::Udf => simple_udf::simple_udf().await?, + ExampleKind::Udtf => simple_udtf::simple_udtf().await?, + ExampleKind::Udwf => simple_udwf::simple_udwf().await?, + ExampleKind::TableListUdtf => table_list_udtf::table_list_udtf().await?, + } + + Ok(()) + } +} + +#[tokio::main] +async fn main() -> Result<()> { + let usage = format!( + "Usage: cargo run --example {} -- [{}]", + ExampleKind::EXAMPLE_NAME, + ExampleKind::VARIANTS.join("|") + ); + + let example: ExampleKind = std::env::args() + .nth(1) + .unwrap_or_else(|| ExampleKind::All.to_string()) + .parse() + .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?; + + example.run().await +} diff --git a/datafusion-examples/examples/simple_udaf.rs b/datafusion-examples/examples/udf/simple_udaf.rs similarity index 96% rename from datafusion-examples/examples/simple_udaf.rs rename to datafusion-examples/examples/udf/simple_udaf.rs index 82bde7c034a57..42ea0054b759f 100644 --- a/datafusion-examples/examples/simple_udaf.rs +++ b/datafusion-examples/examples/udf/simple_udaf.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. +//! /// In this example we will declare a single-type, single return type UDAF that computes the geometric mean. /// The geometric mean is described here: https://en.wikipedia.org/wiki/Geometric_mean use datafusion::arrow::{ @@ -135,8 +137,9 @@ impl Accumulator for GeometricMean { } } -#[tokio::main] -async fn main() -> Result<()> { +/// In this example we register `GeometricMean` +/// as a user defined aggregate function and invoke it via the DataFrame API and SQL +pub async fn simple_udaf() -> Result<()> { let ctx = create_context()?; // here is where we define the UDAF.
We also declare its signature: diff --git a/datafusion-examples/examples/simple_udf.rs b/datafusion-examples/examples/udf/simple_udf.rs similarity index 98% rename from datafusion-examples/examples/simple_udf.rs rename to datafusion-examples/examples/udf/simple_udf.rs index 5612e0939f709..e8d6c9c8173ac 100644 --- a/datafusion-examples/examples/simple_udf.rs +++ b/datafusion-examples/examples/udf/simple_udf.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! See `main.rs` for how to run it. + use datafusion::{ arrow::{ array::{ArrayRef, Float32Array, Float64Array}, @@ -57,8 +59,7 @@ fn create_context() -> Result<SessionContext> { } /// In this example we will declare a single-type, single return type UDF that exponentiates f64, a^b -#[tokio::main] -async fn main() -> Result<()> { +pub async fn simple_udf() -> Result<()> { let ctx = create_context()?; // First, declare the actual implementation of the calculation diff --git a/datafusion-examples/examples/simple_udtf.rs b/datafusion-examples/examples/udf/simple_udtf.rs similarity index 85% rename from datafusion-examples/examples/simple_udtf.rs rename to datafusion-examples/examples/udf/simple_udtf.rs index b65ffb8d71748..af123ab7e5d4a 100644 --- a/datafusion-examples/examples/simple_udtf.rs +++ b/datafusion-examples/examples/udf/simple_udtf.rs @@ -15,53 +15,56 @@ // specific language governing permissions and limitations // under the License. -use arrow::csv::reader::Format; +//! See `main.rs` for how to run it. + +use std::fs::File; +use std::io::Seek; +use std::path::Path; +use std::sync::Arc; + use arrow::csv::ReaderBuilder; +use arrow::csv::reader::Format; use async_trait::async_trait; use datafusion::arrow::datatypes::SchemaRef; use datafusion::arrow::record_batch::RecordBatch; -use datafusion::catalog::Session; -use datafusion::catalog::TableFunctionImpl; -use datafusion::common::{plan_err, ScalarValue}; -use datafusion::datasource::memory::MemorySourceConfig; +use datafusion::catalog::{Session, TableFunctionArgs, TableFunctionImpl}; +use datafusion::common::{ScalarValue, plan_err}; use datafusion::datasource::TableProvider; +use datafusion::datasource::memory::MemorySourceConfig; use datafusion::error::Result; -use datafusion::execution::context::ExecutionProps; use datafusion::logical_expr::simplify::SimplifyContext; use datafusion::logical_expr::{Expr, TableType}; use datafusion::optimizer::simplify_expressions::ExprSimplifier; use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::*; -use std::fs::File; -use std::io::Seek; -use std::path::Path; -use std::sync::Arc; +use datafusion_examples::utils::datasets::ExampleDataset; + // To define your own table function, you only need to do the following 3 things: // 1. Implement your own [`TableProvider`] // 2. Implement your own [`TableFunctionImpl`] and return your [`TableProvider`] // 3.
Register the function using [`SessionContext::register_udtf`] /// This example demonstrates how to register a TableFunction -#[tokio::main] -async fn main() -> Result<()> { +pub async fn simple_udtf() -> Result<()> { // create local execution context let ctx = SessionContext::new(); // register the table function that will be called in SQL statements by `read_csv` ctx.register_udtf("read_csv", Arc::new(LocalCsvTableFunc {})); - let testdata = datafusion::test_util::arrow_test_data(); - let csv_file = format!("{testdata}/csv/aggregate_test_100.csv"); + let dataset = ExampleDataset::Cars; // Pass 2 arguments, read csv with at most 2 rows (simplify logic makes 1+1 --> 2) let df = ctx - .sql(format!("SELECT * FROM read_csv('{csv_file}', 1 + 1);").as_str()) + .sql( + format!("SELECT * FROM read_csv('{}', 1 + 1);", dataset.path_str()?).as_str(), + ) .await?; df.show().await?; // just run, return all rows let df = ctx - .sql(format!("SELECT * FROM read_csv('{csv_file}');").as_str()) + .sql(format!("SELECT * FROM read_csv('{}');", dataset.path_str()?).as_str()) .await?; df.show().await?; @@ -82,10 +85,6 @@ struct LocalCsvTable { #[async_trait] impl TableProvider for LocalCsvTable { - fn as_any(&self) -> &dyn std::any::Any { - self - } - fn schema(&self) -> SchemaRef { self.schema.clone() } @@ -132,9 +131,9 @@ impl TableProvider for LocalCsvTable { struct LocalCsvTableFunc {} impl TableFunctionImpl for LocalCsvTableFunc { - fn call(&self, exprs: &[Expr]) -> Result<Arc<dyn TableProvider>> { - let Some(Expr::Literal(ScalarValue::Utf8(Some(ref path)), _)) = exprs.first() - else { + fn call_with_args(&self, args: TableFunctionArgs) -> Result<Arc<dyn TableProvider>> { + let exprs = args.exprs(); + let Some(Expr::Literal(ScalarValue::Utf8(Some(path)), _)) = exprs.first() else { return plan_err!("read_csv requires at least one string argument"); }; @@ -142,8 +141,7 @@ impl TableFunctionImpl for LocalCsvTableFunc { .get(1) .map(|expr| { // try to simplify the expression, so 1+2 becomes 3, for example - let execution_props = ExecutionProps::new(); - let info = SimplifyContext::new(&execution_props); + let info = SimplifyContext::default(); let expr = ExprSimplifier::new(info).simplify(expr.clone())?; if let Expr::Literal(ScalarValue::Int64(Some(limit)), _) = expr { diff --git a/datafusion-examples/examples/simple_udwf.rs b/datafusion-examples/examples/udf/simple_udwf.rs similarity index 79% rename from datafusion-examples/examples/simple_udwf.rs rename to datafusion-examples/examples/udf/simple_udwf.rs index 1736ff00bd700..1842d88b9ba29 100644 --- a/datafusion-examples/examples/simple_udwf.rs +++ b/datafusion-examples/examples/udf/simple_udwf.rs @@ -15,35 +15,70 @@ // specific language governing permissions and limitations // under the License. -use std::sync::Arc; +//! See `main.rs` for how to run it. + +use std::{fs::File, io::Write, sync::Arc}; use arrow::{ array::{ArrayRef, AsArray, Float64Array}, datatypes::{DataType, Float64Type}, }; - use datafusion::common::ScalarValue; use datafusion::error::Result; use datafusion::logical_expr::{PartitionEvaluator, Volatility, WindowFrame}; use datafusion::prelude::*; +use tempfile::tempdir; // create local execution context with `cars.csv` registered as a table named `cars` async fn create_context() -> Result<SessionContext> { // declare a new context. In spark API, this corresponds to a new spark SQL session let ctx = SessionContext::new(); - // declare a table in memory. In spark API, this corresponds to createDataFrame(...).
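The `simple_udtf` hunk above drops the explicit `ExecutionProps` in favor of `SimplifyContext::default()`. A minimal sketch of that simplification path in isolation, constant-folding `1 + 1` exactly as the example's comment describes:

```rust
use datafusion::error::Result;
use datafusion::logical_expr::simplify::SimplifyContext;
use datafusion::optimizer::simplify_expressions::ExprSimplifier;
use datafusion::prelude::*;

fn main() -> Result<()> {
    // One-step construction; previously this required building
    // ExecutionProps first and passing a reference to SimplifyContext::new.
    let info = SimplifyContext::default();
    let simplified = ExprSimplifier::new(info).simplify(lit(1i64) + lit(1i64))?;

    // The constant fold turns `1 + 1` into the literal `2`.
    assert_eq!(simplified, lit(2i64));
    Ok(())
}
```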
- println!("pwd: {}", std::env::current_dir().unwrap().display()); - let csv_path = "../../datafusion/core/tests/data/cars.csv".to_string(); - let read_options = CsvReadOptions::default().has_header(true); + // content from file 'datafusion/core/tests/data/cars.csv' + let csv_data = r#"car,speed,time +red,20.0,1996-04-12T12:05:03.000000000 +red,20.3,1996-04-12T12:05:04.000000000 +red,21.4,1996-04-12T12:05:05.000000000 +red,21.5,1996-04-12T12:05:06.000000000 +red,19.0,1996-04-12T12:05:07.000000000 +red,18.0,1996-04-12T12:05:08.000000000 +red,17.0,1996-04-12T12:05:09.000000000 +red,7.0,1996-04-12T12:05:10.000000000 +red,7.1,1996-04-12T12:05:11.000000000 +red,7.2,1996-04-12T12:05:12.000000000 +red,3.0,1996-04-12T12:05:13.000000000 +red,1.0,1996-04-12T12:05:14.000000000 +red,0.0,1996-04-12T12:05:15.000000000 +green,10.0,1996-04-12T12:05:03.000000000 +green,10.3,1996-04-12T12:05:04.000000000 +green,10.4,1996-04-12T12:05:05.000000000 +green,10.5,1996-04-12T12:05:06.000000000 +green,11.0,1996-04-12T12:05:07.000000000 +green,12.0,1996-04-12T12:05:08.000000000 +green,14.0,1996-04-12T12:05:09.000000000 +green,15.0,1996-04-12T12:05:10.000000000 +green,15.1,1996-04-12T12:05:11.000000000 +green,15.2,1996-04-12T12:05:12.000000000 +green,8.0,1996-04-12T12:05:13.000000000 +green,2.0,1996-04-12T12:05:14.000000000 +"#; + let dir = tempdir()?; + let file_path = dir.path().join("cars.csv"); + { + let mut file = File::create(&file_path)?; + // write CSV data + file.write_all(csv_data.as_bytes())?; + } // scope closes the file + let file_path = file_path.to_str().unwrap(); + + ctx.register_csv("cars", file_path, CsvReadOptions::new()) + .await?; - ctx.register_csv("cars", &csv_path, read_options).await?; Ok(ctx) } /// In this example we will declare a user defined window function that computes a moving average and then run it using SQL -#[tokio::main] -async fn main() -> Result<()> { +pub async fn simple_udwf() -> Result<()> { let ctx = create_context().await?; // here is where we define the UDWF. We also declare its signature: diff --git a/datafusion-examples/examples/udf/table_list_udtf.rs b/datafusion-examples/examples/udf/table_list_udtf.rs new file mode 100644 index 0000000000000..739f8e11f07e5 --- /dev/null +++ b/datafusion-examples/examples/udf/table_list_udtf.rs @@ -0,0 +1,128 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! See `main.rs` for how to run it. 
+ +use std::sync::{Arc, LazyLock}; + +use arrow::array::{RecordBatch, StringBuilder}; +use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use datafusion::{ + catalog::{MemTable, TableFunctionArgs, TableFunctionImpl, TableProvider}, + common::Result, + execution::SessionState, + prelude::SessionContext, +}; +use datafusion_common::{DataFusionError, plan_err}; +use tokio::{runtime::Handle, task::block_in_place}; + +const FUNCTION_NAME: &str = "table_list"; + +// This example shows how to create a UDTF that depends on the session state. +// It defines a `table_list` UDTF that returns a list of tables within the provided session. + +pub async fn table_list_udtf() -> Result<()> { + let ctx = SessionContext::new(); + ctx.register_udtf(FUNCTION_NAME, Arc::new(TableListUdtf)); + + // Register different kinds of tables. + ctx.sql("create view v as select 1") + .await? + .collect() + .await?; + ctx.sql("create table t(a int)").await?.collect().await?; + + // Print results. + ctx.sql("select * from table_list()").await?.show().await?; + + Ok(()) +} + +#[derive(Debug, Default)] +struct TableListUdtf; + +static SCHEMA: LazyLock<SchemaRef> = LazyLock::new(|| { + SchemaRef::new(Schema::new(vec![ + Field::new("catalog", DataType::Utf8, false), + Field::new("schema", DataType::Utf8, false), + Field::new("table", DataType::Utf8, false), + Field::new("type", DataType::Utf8, false), + ])) +}); + +impl TableFunctionImpl for TableListUdtf { + fn call_with_args(&self, args: TableFunctionArgs) -> Result<Arc<dyn TableProvider>> { + if !args.exprs().is_empty() { + return plan_err!( + "{}: unexpected number of arguments: {}, expected: 0", + FUNCTION_NAME, + args.exprs().len() + ); + } + let state = args + .session() + .as_any() + .downcast_ref::<SessionState>() + .ok_or_else(|| { + DataFusionError::Internal("failed to downcast state".into()) + })?; + + let mut catalogs = StringBuilder::new(); + let mut schemas = StringBuilder::new(); + let mut tables = StringBuilder::new(); + let mut types = StringBuilder::new(); + + let catalog_list = state.catalog_list(); + for catalog_name in catalog_list.catalog_names() { + let Some(catalog) = catalog_list.catalog(&catalog_name) else { + continue; + }; + for schema_name in catalog.schema_names() { + let Some(schema) = catalog.schema(&schema_name) else { + continue; + }; + for table_name in schema.table_names() { + let Some(provider) = block_in_place(|| { + Handle::current().block_on(schema.table(&table_name)) + })? + else { + continue; + }; + catalogs.append_value(catalog_name.clone()); + schemas.append_value(schema_name.clone()); + tables.append_value(table_name.clone()); + types.append_value(provider.table_type().to_string()) + } + } + } + + let batch = RecordBatch::try_new( + Arc::clone(&SCHEMA), + vec![ + Arc::new(catalogs.finish()), + Arc::new(schemas.finish()), + Arc::new(tables.finish()), + Arc::new(types.finish()), + ], + )?; + + Ok(Arc::new(MemTable::try_new( + batch.schema(), + vec![vec![batch]], + )?)) + } +} diff --git a/datafusion-examples/src/bin/examples-docs.rs b/datafusion-examples/src/bin/examples-docs.rs new file mode 100644 index 0000000000000..7efcf4da15d20 --- /dev/null +++ b/datafusion-examples/src/bin/examples-docs.rs @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Generates Markdown documentation for DataFusion example groups. +//! +//! This binary scans `datafusion-examples/examples`, extracts structured +//! documentation from each group's `main.rs` file, and renders a README-style +//! Markdown document. +//! +//! By default, documentation is generated for all example groups. If a group +//! name is provided as the first CLI argument, only that group is rendered. +//! +//! ## Usage +//! +//! ```bash +//! # Generate docs for all example groups +//! cargo run --bin examples-docs +//! +//! # Generate docs for a single group +//! cargo run --bin examples-docs -- dataframe +//! ``` + +use datafusion_examples::utils::example_metadata::{ + RepoLayout, generate_examples_readme, +}; + +fn main() -> Result<(), Box<dyn std::error::Error>> { + let layout = RepoLayout::detect()?; + let group = std::env::args().nth(1); + let markdown = generate_examples_readme(&layout, group.as_deref())?; + print!("{markdown}"); + Ok(()) +} diff --git a/datafusion-examples/src/lib.rs b/datafusion-examples/src/lib.rs new file mode 100644 index 0000000000000..7f334aedaafe2 --- /dev/null +++ b/datafusion-examples/src/lib.rs @@ -0,0 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Internal utilities shared by the DataFusion examples. + +pub mod utils; diff --git a/datafusion-examples/src/utils/csv_to_parquet.rs b/datafusion-examples/src/utils/csv_to_parquet.rs new file mode 100644 index 0000000000000..1fbf2930e9043 --- /dev/null +++ b/datafusion-examples/src/utils/csv_to_parquet.rs @@ -0,0 +1,244 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::path::{Path, PathBuf}; + +use datafusion::dataframe::DataFrameWriteOptions; +use datafusion::error::{DataFusionError, Result}; +use datafusion::prelude::{CsvReadOptions, SessionContext}; +use tempfile::TempDir; +use tokio::fs::create_dir_all; + +/// Temporary Parquet directory that is deleted when dropped. +#[derive(Debug)] +pub struct ParquetTemp { + pub tmp_dir: TempDir, + pub parquet_dir: PathBuf, +} + +impl ParquetTemp { + pub fn path(&self) -> &Path { + &self.parquet_dir + } + + pub fn path_str(&self) -> Result<&str> { + self.parquet_dir.to_str().ok_or_else(|| { + DataFusionError::Execution(format!( + "Parquet directory path is not valid UTF-8: {}", + self.parquet_dir.display() + )) + }) + } + + pub fn file_uri(&self) -> Result<String> { + Ok(format!("file://{}", self.path_str()?)) + } +} + +/// Helper for examples: load a CSV file and materialize it as Parquet +/// in a temporary directory. +/// +/// # Example +/// ``` +/// use std::path::PathBuf; +/// use datafusion::prelude::*; +/// use datafusion_examples::utils::write_csv_to_parquet; +/// # use datafusion::assert_batches_eq; +/// # use datafusion::error::Result; +/// # #[tokio::main] +/// # async fn main() -> Result<()> { +/// let ctx = SessionContext::new(); +/// let csv_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) +/// .join("data") +/// .join("csv") +/// .join("cars.csv"); +/// let parquet_dir = write_csv_to_parquet(&ctx, &csv_path).await?; +/// let df = ctx.read_parquet(parquet_dir.path_str()?, ParquetReadOptions::default()).await?; +/// let rows = df +/// .sort(vec![col("speed").sort(true, true)])? +/// .limit(0, Some(5))?; +/// assert_batches_eq!( +/// &[ +/// "+-------+-------+---------------------+", +/// "| car | speed | time |", +/// "+-------+-------+---------------------+", +/// "| red | 0.0 | 1996-04-12T12:05:15 |", +/// "| red | 1.0 | 1996-04-12T12:05:14 |", +/// "| green | 2.0 | 1996-04-12T12:05:14 |", +/// "| red | 3.0 | 1996-04-12T12:05:13 |", +/// "| red | 7.0 | 1996-04-12T12:05:10 |", +/// "+-------+-------+---------------------+", +/// ], +/// &rows.collect().await?
+/// ); +/// # Ok(()) +/// # } +/// ``` +pub async fn write_csv_to_parquet( + ctx: &SessionContext, + csv_path: &Path, +) -> Result<ParquetTemp> { + if !csv_path.is_file() { + return Err(DataFusionError::Execution(format!( + "CSV file does not exist: {}", + csv_path.display() + ))); + } + + let csv_path = csv_path.to_str().ok_or_else(|| { + DataFusionError::Execution("CSV path is not valid UTF-8".to_string()) + })?; + + let csv_df = ctx.read_csv(csv_path, CsvReadOptions::default()).await?; + + let tmp_dir = TempDir::new()?; + let parquet_dir = tmp_dir.path().join("parquet_source"); + create_dir_all(&parquet_dir).await?; + + let path = parquet_dir.to_str().ok_or_else(|| { + DataFusionError::Execution("Failed processing tmp directory path".to_string()) + })?; + + csv_df + .write_parquet(path, DataFrameWriteOptions::default(), None) + .await?; + + Ok(ParquetTemp { + tmp_dir, + parquet_dir, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::path::PathBuf; + + use datafusion::assert_batches_eq; + use datafusion::prelude::*; + + #[tokio::test] + async fn test_write_csv_to_parquet_with_cars_data() -> Result<()> { + let ctx = SessionContext::new(); + let csv_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("csv") + .join("cars.csv"); + + let parquet_dir = write_csv_to_parquet(&ctx, &csv_path).await?; + let df = ctx + .read_parquet(parquet_dir.path_str()?, ParquetReadOptions::default()) + .await?; + + let rows = df.sort(vec![col("speed").sort(true, true)])?; + assert_batches_eq!( + &[ + "+-------+-------+---------------------+", + "| car | speed | time |", + "+-------+-------+---------------------+", + "| red | 0.0 | 1996-04-12T12:05:15 |", + "| red | 1.0 | 1996-04-12T12:05:14 |", + "| green | 2.0 | 1996-04-12T12:05:14 |", + "| red | 3.0 | 1996-04-12T12:05:13 |", + "| red | 7.0 | 1996-04-12T12:05:10 |", + "| red | 7.1 | 1996-04-12T12:05:11 |", + "| red | 7.2 | 1996-04-12T12:05:12 |", + "| green | 8.0 | 1996-04-12T12:05:13 |", + "| green | 10.0 | 1996-04-12T12:05:03 |", + "| green | 10.3 | 1996-04-12T12:05:04 |", + "| green | 10.4 | 1996-04-12T12:05:05 |", + "| green | 10.5 | 1996-04-12T12:05:06 |", + "| green | 11.0 | 1996-04-12T12:05:07 |", + "| green | 12.0 | 1996-04-12T12:05:08 |", + "| green | 14.0 | 1996-04-12T12:05:09 |", + "| green | 15.0 | 1996-04-12T12:05:10 |", + "| green | 15.1 | 1996-04-12T12:05:11 |", + "| green | 15.2 | 1996-04-12T12:05:12 |", + "| red | 17.0 | 1996-04-12T12:05:09 |", + "| red | 18.0 | 1996-04-12T12:05:08 |", + "| red | 19.0 | 1996-04-12T12:05:07 |", + "| red | 20.0 | 1996-04-12T12:05:03 |", + "| red | 20.3 | 1996-04-12T12:05:04 |", + "| red | 21.4 | 1996-04-12T12:05:05 |", + "| red | 21.5 | 1996-04-12T12:05:06 |", + "+-------+-------+---------------------+", + ], + &rows.collect().await?
+ ); + + Ok(()) + } + + #[tokio::test] + async fn test_write_csv_to_parquet_with_regex_data() -> Result<()> { + let ctx = SessionContext::new(); + let csv_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("csv") + .join("regex.csv"); + + let parquet_dir = write_csv_to_parquet(&ctx, &csv_path).await?; + let df = ctx + .read_parquet(parquet_dir.path_str()?, ParquetReadOptions::default()) + .await?; + + let rows = df.sort(vec![col("values").sort(true, true)])?; + assert_batches_eq!( + &[ + "+------------+--------------------------------------+-------------+-------+", + "| values | patterns | replacement | flags |", + "+------------+--------------------------------------+-------------+-------+", + "| 4000 | \\b4([1-9]\\d\\d|\\d[1-9]\\d|\\d\\d[1-9])\\b | xyz | |", + "| 4010 | \\b4([1-9]\\d\\d|\\d[1-9]\\d|\\d\\d[1-9])\\b | xyz | |", + "| ABC | ^(A).* | B | i |", + "| AbC | (B|D) | e | |", + "| Düsseldorf | [\\p{Letter}-]+ | München | |", + "| Köln | [a-zA-Z]ö[a-zA-Z]{2} | Koln | |", + "| aBC | ^(b|c) | d | |", + "| aBc | (b|d) | e | i |", + "| abc | ^(a) | bb\\1bb | i |", + "| Москва | [\\p{L}-]+ | Moscow | |", + "| اليوم | ^\\p{Arabic}+$ | Today | |", + "+------------+--------------------------------------+-------------+-------+", + ], + &rows.collect().await? + ); + + Ok(()) + } + + #[tokio::test] + async fn test_write_csv_to_parquet_error() { + let ctx = SessionContext::new(); + let csv_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("csv") + .join("file-does-not-exist.csv"); + + let err = write_csv_to_parquet(&ctx, &csv_path).await.unwrap_err(); + match err { + DataFusionError::Execution(msg) => { + assert!( + msg.contains("CSV file does not exist"), + "unexpected error message: {msg}" + ); + } + other => panic!("unexpected error variant: {other:?}"), + } + } +} diff --git a/datafusion/datasource-avro/src/avro_to_arrow/mod.rs b/datafusion-examples/src/utils/datasets/cars.rs similarity index 58% rename from datafusion/datasource-avro/src/avro_to_arrow/mod.rs rename to datafusion-examples/src/utils/datasets/cars.rs index c1530a4880205..2d8547c16d686 100644 --- a/datafusion/datasource-avro/src/avro_to_arrow/mod.rs +++ b/datafusion-examples/src/utils/datasets/cars.rs @@ -15,25 +15,19 @@ // specific language governing permissions and limitations // under the License. -//! This module contains code for reading [Avro] data into `RecordBatch`es -//! -//! [Avro]: https://avro.apache.org/docs/1.2.0/ +use std::sync::Arc; -mod arrow_array_reader; -mod reader; -mod schema; +use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; -use arrow::datatypes::Schema; -pub use reader::{Reader, ReaderBuilder}; - -pub use schema::to_arrow_schema; -use std::io::Read; - -/// Read Avro schema given a reader -pub fn read_avro_schema_from_reader<R: Read>( - reader: &mut R, -) -> datafusion_common::Result<Schema> { - let avro_reader = apache_avro::Reader::new(reader)?; - let schema = avro_reader.writer_schema(); - to_arrow_schema(schema) +/// Schema for the `data/csv/cars.csv` example dataset.
+pub fn schema() -> Arc<Schema> { + Arc::new(Schema::new(vec![ + Field::new("car", DataType::Utf8, false), + Field::new("speed", DataType::Float64, false), + Field::new( + "time", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ), + ])) } diff --git a/datafusion-examples/src/utils/datasets/mod.rs b/datafusion-examples/src/utils/datasets/mod.rs new file mode 100644 index 0000000000000..1857e6af9b559 --- /dev/null +++ b/datafusion-examples/src/utils/datasets/mod.rs @@ -0,0 +1,139 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::path::PathBuf; + +use arrow_schema::SchemaRef; +use datafusion::error::{DataFusionError, Result}; + +pub mod cars; +pub mod regex; + +/// Describes example datasets used across DataFusion examples. +/// +/// This enum provides a single, discoverable place to define +/// dataset-specific metadata such as file paths and schemas. +#[derive(Debug)] +pub enum ExampleDataset { + Cars, + Regex, +} + +impl ExampleDataset { + pub fn file_stem(&self) -> &'static str { + match self { + Self::Cars => "cars", + Self::Regex => "regex", + } + } + + pub fn path(&self) -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("csv") + .join(format!("{}.csv", self.file_stem())) + } + + pub fn path_str(&self) -> Result<String> { + let path = self.path(); + path.to_str().map(String::from).ok_or_else(|| { + DataFusionError::Execution(format!( + "CSV directory path is not valid UTF-8: {}", + path.display() + )) + }) + } + + pub fn schema(&self) -> SchemaRef { + match self { + Self::Cars => cars::schema(), + Self::Regex => regex::schema(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use arrow::datatypes::{DataType, TimeUnit}; + + #[test] + fn example_dataset_file_stem() { + assert_eq!(ExampleDataset::Cars.file_stem(), "cars"); + assert_eq!(ExampleDataset::Regex.file_stem(), "regex"); + } + + #[test] + fn example_dataset_path_points_to_csv() { + let path = ExampleDataset::Cars.path(); + assert!(path.ends_with("data/csv/cars.csv")); + + let path = ExampleDataset::Regex.path(); + assert!(path.ends_with("data/csv/regex.csv")); + } + + #[test] + fn example_dataset_path_str_is_valid_utf8() { + let path = ExampleDataset::Cars.path_str().unwrap(); + assert!(path.ends_with("cars.csv")); + + let path = ExampleDataset::Regex.path_str().unwrap(); + assert!(path.ends_with("regex.csv")); + } + + #[test] + fn cars_schema_is_stable() { + let schema = ExampleDataset::Cars.schema(); + + let fields: Vec<_> = schema + .fields() + .iter() + .map(|f| (f.name().as_str(), f.data_type().clone())) + .collect(); + + assert_eq!( + fields, + vec![ + ("car", DataType::Utf8), + ("speed", DataType::Float64), + ("time", DataType::Timestamp(TimeUnit::Nanosecond, None)), + ] + ); + } + + #[test] + fn regex_schema_is_stable() {
+ let schema = ExampleDataset::Regex.schema(); + + let fields: Vec<_> = schema + .fields() + .iter() + .map(|f| (f.name().as_str(), f.data_type().clone())) + .collect(); + + assert_eq!( + fields, + vec![ + ("values", DataType::Utf8), + ("patterns", DataType::Utf8), + ("replacement", DataType::Utf8), + ("flags", DataType::Utf8), + ] + ); + } +} diff --git a/datafusion/sqllogictest/src/engines/postgres_engine/types.rs b/datafusion-examples/src/utils/datasets/regex.rs similarity index 53% rename from datafusion/sqllogictest/src/engines/postgres_engine/types.rs rename to datafusion-examples/src/utils/datasets/regex.rs index 510462befb086..d44582126a053 100644 --- a/datafusion/sqllogictest/src/engines/postgres_engine/types.rs +++ b/datafusion-examples/src/utils/datasets/regex.rs @@ -15,31 +15,16 @@ // specific language governing permissions and limitations // under the License. -use postgres_types::Type; -use std::fmt::Display; -use tokio_postgres::types::FromSql; +use std::sync::Arc; -pub struct PgRegtype { - value: String, -} - -impl<'a> FromSql<'a> for PgRegtype { - fn from_sql( - _: &Type, - buf: &'a [u8], - ) -> Result<Self, Box<dyn std::error::Error + Sync + Send>> { - let oid = postgres_protocol::types::oid_from_sql(buf)?; - let value = Type::from_oid(oid).ok_or("bad type")?.to_string(); - Ok(PgRegtype { value }) - } - - fn accepts(ty: &Type) -> bool { - matches!(*ty, Type::REGTYPE) - } -} +use arrow::datatypes::{DataType, Field, Schema}; -impl Display for PgRegtype { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.value) - } +/// Schema for the `data/csv/regex.csv` example dataset. +pub fn schema() -> Arc<Schema> { + Arc::new(Schema::new(vec![ + Field::new("values", DataType::Utf8, false), + Field::new("patterns", DataType::Utf8, false), + Field::new("replacement", DataType::Utf8, false), + Field::new("flags", DataType::Utf8, true), + ])) } diff --git a/datafusion-examples/src/utils/example_metadata/discover.rs b/datafusion-examples/src/utils/example_metadata/discover.rs new file mode 100644 index 0000000000000..1ba5f6d29a14e --- /dev/null +++ b/datafusion-examples/src/utils/example_metadata/discover.rs @@ -0,0 +1,103 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Utilities for discovering example groups in the repository filesystem. +//! +//! An example group is defined as a directory containing a `main.rs` file +//! under the examples root. This module is intentionally filesystem-focused +//! and does not perform any parsing or rendering. +//! Discovery fails if no valid example groups are found. + +use std::fs; +use std::path::{Path, PathBuf}; + +use datafusion::common::exec_err; +use datafusion::error::Result; + +/// Discovers all example group directories under the given root.
+/// +/// A directory is considered an example group if it contains a `main.rs` file. +pub fn discover_example_groups(root: &Path) -> Result<Vec<PathBuf>> { + let mut groups = Vec::new(); + for entry in fs::read_dir(root)? { + let entry = entry?; + let path = entry.path(); + + if path.is_dir() && path.join("main.rs").is_file() { + groups.push(path); + } + } + + if groups.is_empty() { + return exec_err!("No example groups found under: {}", root.display()); + } + + groups.sort(); + Ok(groups) +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::utils::example_metadata::test_utils::assert_exec_err_contains; + + use std::fs::{self, File}; + + use tempfile::TempDir; + + #[test] + fn discover_example_groups_finds_dirs_with_main_rs() -> Result<()> { + let tmp = TempDir::new()?; + let root = tmp.path(); + + // valid example group + let group1 = root.join("group1"); + fs::create_dir(&group1)?; + File::create(group1.join("main.rs"))?; + + // not an example group + let group2 = root.join("group2"); + fs::create_dir(&group2)?; + + let groups = discover_example_groups(root)?; + assert_eq!(groups.len(), 1); + assert_eq!(groups[0], group1); + Ok(()) + } + + #[test] + fn discover_example_groups_errors_if_main_rs_is_a_directory() -> Result<()> { + let tmp = TempDir::new()?; + let root = tmp.path(); + let group = root.join("group"); + fs::create_dir(&group)?; + fs::create_dir(group.join("main.rs"))?; + + let err = discover_example_groups(root).unwrap_err(); + assert_exec_err_contains(err, "No example groups found"); + Ok(()) + } + + #[test] + fn discover_example_groups_errors_if_none_found() -> Result<()> { + let tmp = TempDir::new()?; + let err = discover_example_groups(tmp.path()).unwrap_err(); + assert_exec_err_contains(err, "No example groups found"); + Ok(()) + } +} diff --git a/datafusion-examples/src/utils/example_metadata/layout.rs b/datafusion-examples/src/utils/example_metadata/layout.rs new file mode 100644 index 0000000000000..ee6fad89855f9 --- /dev/null +++ b/datafusion-examples/src/utils/example_metadata/layout.rs @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Repository layout utilities. +//! +//! This module provides a small helper (`RepoLayout`) that encapsulates +//! knowledge about the DataFusion repository structure, in particular +//! where example groups are located relative to the repository root. + +use std::path::{Path, PathBuf}; + +use datafusion::error::{DataFusionError, Result}; + +/// Describes the layout of a DataFusion repository. +/// +/// This type centralizes knowledge about where example-related +/// directories live relative to the repository root.
+#[derive(Debug, Clone)] +pub struct RepoLayout { + root: PathBuf, +} + +impl From<&Path> for RepoLayout { + fn from(path: &Path) -> Self { + Self { + root: path.to_path_buf(), + } + } +} + +impl RepoLayout { + /// Creates a layout from an explicit repository root. + pub fn from_root(root: PathBuf) -> Self { + Self { root } + } + + /// Detects the repository root based on `CARGO_MANIFEST_DIR`. + /// + /// This is intended for use from binaries inside the workspace. + pub fn detect() -> Result<Self> { + let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + + let root = manifest_dir.parent().ok_or_else(|| { + DataFusionError::Execution( + "CARGO_MANIFEST_DIR does not have a parent".to_string(), + ) + })?; + + Ok(Self { + root: root.to_path_buf(), + }) + } + + /// Returns the repository root directory. + pub fn root(&self) -> &Path { + &self.root + } + + /// Returns the `datafusion-examples/examples` directory. + pub fn examples_root(&self) -> PathBuf { + self.root.join("datafusion-examples").join("examples") + } + + /// Returns the directory for a single example group. + /// + /// Example: `examples/udf` + pub fn example_group_dir(&self, group: &str) -> PathBuf { + self.examples_root().join(group) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn detect_sets_non_empty_root() -> Result<()> { + let layout = RepoLayout::detect()?; + assert!(!layout.root().as_os_str().is_empty()); + Ok(()) + } + + #[test] + fn examples_root_is_under_repo_root() -> Result<()> { + let layout = RepoLayout::detect()?; + let examples_root = layout.examples_root(); + assert!(examples_root.starts_with(layout.root())); + assert!(examples_root.ends_with("datafusion-examples/examples")); + Ok(()) + } + + #[test] + fn example_group_dir_appends_group_name() -> Result<()> { + let layout = RepoLayout::detect()?; + let group_dir = layout.example_group_dir("foo"); + assert!(group_dir.ends_with("datafusion-examples/examples/foo")); + Ok(()) + } +} diff --git a/datafusion-examples/src/utils/example_metadata/mod.rs b/datafusion-examples/src/utils/example_metadata/mod.rs new file mode 100644 index 0000000000000..ab4c8e4a8e4c2 --- /dev/null +++ b/datafusion-examples/src/utils/example_metadata/mod.rs @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Documentation generator for DataFusion examples. +//! +//! # Design goals +//! +//! - Keep README.md in sync with runnable examples +//! - Fail fast on malformed documentation +//! +//! # Overview +//! +//! Each example group corresponds to a directory under +//! `datafusion-examples/examples/` containing a `main.rs` file. +//! Documentation is extracted from structured `//!` comments in that file. +//! +//! For each example group, the generator produces: +//! +//! ```text +//! ## Examples +//!
### Group: `<group_name>` +//! #### Category: Single Process | Distributed +//! +//! | Subcommand | File Path | Description | +//! ``` +//! +//! # Usage +//! +//! Generate documentation for a single group only: +//! +//! ```bash +//! cargo run --bin examples-docs -- dataframe +//! ``` +//! +//! Generate documentation for all examples: +//! +//! ```bash +//! cargo run --bin examples-docs +//! ``` + +pub mod discover; +pub mod layout; +pub mod model; +pub mod parser; +pub mod render; + +#[cfg(test)] +pub mod test_utils; + +pub use layout::RepoLayout; +pub use model::{Category, ExampleEntry, ExampleGroup, GroupName}; +pub use parser::parse_main_rs_docs; +pub use render::generate_examples_readme; diff --git a/datafusion-examples/src/utils/example_metadata/model.rs b/datafusion-examples/src/utils/example_metadata/model.rs new file mode 100644 index 0000000000000..11416d141eb74 --- /dev/null +++ b/datafusion-examples/src/utils/example_metadata/model.rs @@ -0,0 +1,418 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Domain model for DataFusion example documentation. +//! +//! This module defines the core data structures used to represent +//! example groups, individual examples, and their categorization +//! as parsed from `main.rs` documentation comments. + +use std::path::Path; + +use datafusion::error::{DataFusionError, Result}; + +use crate::utils::example_metadata::parse_main_rs_docs; + +/// Well-known abbreviations used to preserve correct capitalization +/// when generating human-readable documentation titles. +const ABBREVIATIONS: &[(&str, &str)] = &[ + ("dataframe", "DataFrame"), + ("io", "IO"), + ("sql", "SQL"), + ("udf", "UDF"), +]; + +/// A group of related examples (e.g. `builtin_functions`, `udf`). +/// +/// Each group corresponds to a directory containing a `main.rs` file +/// with structured documentation comments. +#[derive(Debug)] +pub struct ExampleGroup { + pub name: GroupName, + pub examples: Vec<ExampleEntry>, + pub category: Category, +} + +impl ExampleGroup { + /// Parses an example group from its directory. + /// + /// The group name is derived from the directory name, and example + /// entries are extracted from `main.rs`. + pub fn from_dir(dir: &Path, category: Category) -> Result<Self> { + let raw_name = dir + .file_name() + .and_then(|s| s.to_str()) + .ok_or_else(|| { + DataFusionError::Execution("Invalid example group dir".to_string()) + })? + .to_string(); + + let name = GroupName::from_dir_name(raw_name); + let main_rs = dir.join("main.rs"); + let examples = parse_main_rs_docs(&main_rs)?; + + Ok(Self { + name, + examples, + category, + }) + } +} + +/// Represents an example group name in both raw and human-readable forms.
+/// +/// For example: +/// - raw: `builtin_functions` +/// - title: `Builtin Functions` +#[derive(Debug)] +pub struct GroupName { + raw: String, + title: String, +} + +impl GroupName { + /// Creates a group name from a directory name. + pub fn from_dir_name(raw: String) -> Self { + let title = raw + .split('_') + .map(format_part) + .collect::<Vec<_>>() + .join(" "); + + Self { raw, title } + } + + /// Returns the raw group name (directory name). + pub fn raw(&self) -> &str { + &self.raw + } + + /// Returns a title-cased name for documentation. + pub fn title(&self) -> &str { + &self.title + } +} + +/// A single runnable example within a group. +/// +/// Each entry corresponds to a subcommand documented in `main.rs`. +#[derive(Debug)] +pub struct ExampleEntry { + /// CLI subcommand name. + pub subcommand: String, + /// Rust source file name. + pub file: String, + /// Human-readable description. + pub desc: String, +} + +/// Execution category of an example group. +#[derive(Debug, Default)] +pub enum Category { + /// Runs in a single process. + #[default] + SingleProcess, + /// Requires a distributed setup. + Distributed, +} + +impl Category { + /// Returns the display name used in documentation. + pub fn name(&self) -> &str { + match self { + Self::SingleProcess => "Single Process", + Self::Distributed => "Distributed", + } + } + + /// Determines the category for a group by name. + pub fn for_group(name: &str) -> Self { + match name { + "flight" => Category::Distributed, + _ => Category::SingleProcess, + } + } +} + +/// Formats a single group-name segment for display. +/// +/// This function applies DataFusion-specific capitalization rules: +/// - Known abbreviations (e.g. `sql`, `io`, `udf`) are rendered in all caps +/// - All other segments fall back to standard Title Case +fn format_part(part: &str) -> String { + let lower = part.to_ascii_lowercase(); + + if let Some((_, replacement)) = ABBREVIATIONS.iter().find(|(k, _)| *k == lower) { + return replacement.to_string(); + } + + let mut chars = part.chars(); + match chars.next() { + Some(first) => first.to_uppercase().collect::<String>() + chars.as_str(), + None => String::new(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::utils::example_metadata::test_utils::{ + assert_exec_err_contains, example_group_from_docs, + }; + + use std::fs; + + use tempfile::TempDir; + + #[test] + fn category_for_group_works() { + assert!(matches!( + Category::for_group("flight"), + Category::Distributed + )); + assert!(matches!( + Category::for_group("anything_else"), + Category::SingleProcess + )); + } + + #[test] + fn all_subcommand_is_ignored() -> Result<()> { + let group = example_group_from_docs( + r#" + //! - `all` — run all examples included in this module + //! + //! - `foo` + //! (file: foo.rs, desc: foo example) + "#, + )?; + assert_eq!(group.examples.len(), 1); + assert_eq!(group.examples[0].subcommand, "foo"); + Ok(()) + } + + #[test] + fn metadata_without_subcommand_fails() { + let err = example_group_from_docs("//!
(file: foo.rs, desc: missing subcommand)") + .unwrap_err(); + assert_exec_err_contains(err, "Metadata without preceding subcommand"); + } + + #[test] + fn group_name_handles_abbreviations() { + assert_eq!( + GroupName::from_dir_name("dataframe".to_string()).title(), + "DataFrame" + ); + assert_eq!( + GroupName::from_dir_name("data_io".to_string()).title(), + "Data IO" + ); + assert_eq!( + GroupName::from_dir_name("sql_ops".to_string()).title(), + "SQL Ops" + ); + assert_eq!(GroupName::from_dir_name("udf".to_string()).title(), "UDF"); + } + + #[test] + fn group_name_title_cases() { + let cases = [ + ("very_long_group_name", "Very Long Group Name"), + ("foo", "Foo"), + ("dataframe", "DataFrame"), + ("data_io", "Data IO"), + ("sql_ops", "SQL Ops"), + ("udf", "UDF"), + ]; + for (input, expected) in cases { + let name = GroupName::from_dir_name(input.to_string()); + assert_eq!(name.title(), expected); + } + } + + #[test] + fn parse_group_example_works() -> Result<()> { + let tmp = TempDir::new().unwrap(); + + // Simulate: examples/builtin_functions/ + let group_dir = tmp.path().join("builtin_functions"); + fs::create_dir(&group_dir)?; + + // Write a fake main.rs with docs + let main_rs = group_dir.join("main.rs"); + fs::write( + &main_rs, + r#" + // Licensed to the Apache Software Foundation (ASF) under one + // or more contributor license agreements. See the NOTICE file + // distributed with this work for additional information + // regarding copyright ownership. The ASF licenses this file + // to you under the Apache License, Version 2.0 (the + // "License"); you may not use this file except in compliance + // with the License. You may obtain a copy of the License at + // + // http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, + // software distributed under the License is distributed on an + // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + // KIND, either express or implied. See the License for the + // specific language governing permissions and limitations + // under the License. + // + //! # These are miscellaneous function-related examples + //! + //! These examples demonstrate miscellaneous function-related features. + //! + //! ## Usage + //! ```bash + //! cargo run --example builtin_functions -- [all|date_time|function_factory|regexp] + //! ``` + //! + //! Each subcommand runs a corresponding example: + //! - `all` — run all examples included in this module + //! + //! - `date_time` + //! (file: date_time.rs, desc: Examples of date-time related functions and queries) + //! + //! - `function_factory` + //! (file: function_factory.rs, desc: Register `CREATE FUNCTION` handler to implement SQL macros) + //! + //! - `regexp` + //! 
(file: regexp.rs, desc: Examples of using regular expression functions) + "#, + )?; + + let group = ExampleGroup::from_dir(&group_dir, Category::SingleProcess)?; + + // Assert group-level data + assert_eq!(group.name.title(), "Builtin Functions"); + assert_eq!(group.examples.len(), 3); + + // Assert the first example + assert_eq!(group.examples[0].subcommand, "date_time"); + assert_eq!(group.examples[0].file, "date_time.rs"); + assert_eq!( + group.examples[0].desc, + "Examples of date-time related functions and queries" + ); + + // Assert the second example + assert_eq!(group.examples[1].subcommand, "function_factory"); + assert_eq!(group.examples[1].file, "function_factory.rs"); + assert_eq!( + group.examples[1].desc, + "Register `CREATE FUNCTION` handler to implement SQL macros" + ); + + // Assert the third example + assert_eq!(group.examples[2].subcommand, "regexp"); + assert_eq!(group.examples[2].file, "regexp.rs"); + assert_eq!( + group.examples[2].desc, + "Examples of using regular expression functions" + ); + + Ok(()) + } + + #[test] + fn duplicate_metadata_without_repeating_subcommand_fails() { + let err = example_group_from_docs( + r#" + //! - `foo` + //! (file: a.rs, desc: first) + //! (file: b.rs, desc: second) + "#, + ) + .unwrap_err(); + assert_exec_err_contains(err, "Metadata without preceding subcommand"); + } + + #[test] + fn duplicate_metadata_for_same_subcommand_fails() { + let err = example_group_from_docs( + r#" + //! - `foo` + //! (file: a.rs, desc: first) + //! + //! - `foo` + //! (file: b.rs, desc: second) + "#, + ) + .unwrap_err(); + assert_exec_err_contains(err, "Duplicate metadata for subcommand `foo`"); + } + + #[test] + fn metadata_must_follow_subcommand() { + let err = example_group_from_docs( + r#" + //! - `foo` + //! some unrelated comment + //! (file: foo.rs, desc: test) + "#, + ) + .unwrap_err(); + assert_exec_err_contains(err, "Metadata without preceding subcommand"); + } + + #[test] + fn preserves_example_order_from_main_rs() -> Result<()> { + let group = example_group_from_docs( + r#" + //! - `second` + //! (file: second.rs, desc: second example) + //! + //! - `first` + //! (file: first.rs, desc: first example) + //! + //! - `third` + //! (file: third.rs, desc: third example) + "#, + )?; + + let subcommands: Vec<&str> = group + .examples + .iter() + .map(|e| e.subcommand.as_str()) + .collect(); + + assert_eq!( + subcommands, + vec!["second", "first", "third"], + "examples must preserve the order defined in main.rs" + ); + + Ok(()) + } + + #[test] + fn metadata_can_follow_blank_doc_line() -> Result<()> { + let group = example_group_from_docs( + r#" + //! - `foo` + //! + //! (file: foo.rs, desc: test) + "#, + )?; + assert_eq!(group.examples.len(), 1); + Ok(()) + } +} diff --git a/datafusion-examples/src/utils/example_metadata/parser.rs b/datafusion-examples/src/utils/example_metadata/parser.rs new file mode 100644 index 0000000000000..4ead3e5a2ae9f --- /dev/null +++ b/datafusion-examples/src/utils/example_metadata/parser.rs @@ -0,0 +1,267 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Parser for example metadata embedded in `main.rs` documentation comments. +//! +//! This module scans `//!` doc comments to extract example subcommands +//! and their associated metadata (file name and description), enforcing +//! a strict ordering and structure to avoid ambiguous documentation. + +use std::{collections::HashSet, fs, path::Path}; + +use datafusion::common::exec_err; +use datafusion::error::Result; +use nom::{ + Err, IResult, Parser, + bytes::complete::{tag, take_until, take_while}, + character::complete::multispace0, + combinator::all_consuming, + error::{Error, ErrorKind}, + sequence::{delimited, preceded}, +}; + +use crate::utils::example_metadata::ExampleEntry; + +/// Parsing state machine used while scanning `main.rs` docs. +/// +/// This makes the "subcommand - metadata" relationship explicit: +/// metadata is only valid immediately after a subcommand has been seen. +enum ParserState<'a> { + /// Not currently expecting metadata. + Idle, + /// A subcommand was just parsed; the next valid metadata (if any) + /// must belong to this subcommand. + SeenSubcommand(&'a str), +} + +/// Parses a subcommand declaration line from `main.rs` docs. +/// +/// Expected format: +/// ```text +/// //! - `` +/// ``` +fn parse_subcommand_line(input: &str) -> IResult<&str, &str> { + let parser = preceded( + multispace0, + delimited(tag("//! - `"), take_until("`"), tag("`")), + ); + all_consuming(parser).parse(input) +} + +/// Parses example metadata (file name and description) from `main.rs` docs. +/// +/// Expected format: +/// ```text +/// //! (file: .rs, desc: ) +/// ``` +fn parse_metadata_line(input: &str) -> IResult<&str, (&str, &str)> { + let parser = preceded( + multispace0, + preceded(tag("//!"), preceded(multispace0, take_while(|_| true))), + ); + let (rest, payload) = all_consuming(parser).parse(input)?; + + let content = payload + .strip_prefix("(") + .and_then(|s| s.strip_suffix(")")) + .ok_or_else(|| Err::Error(Error::new(payload, ErrorKind::Tag)))?; + + let (file, desc) = content + .strip_prefix("file:") + .ok_or_else(|| Err::Error(Error::new(payload, ErrorKind::Tag)))? + .split_once(", desc:") + .ok_or_else(|| Err::Error(Error::new(payload, ErrorKind::Tag)))?; + + Ok((rest, (file.trim(), desc.trim()))) +} + +/// Parses example entries from a group's `main.rs` file. 
+pub fn parse_main_rs_docs(path: &Path) -> Result<Vec<ExampleEntry>> { + let content = fs::read_to_string(path)?; + let mut entries = vec![]; + let mut state = ParserState::Idle; + let mut seen_subcommands = HashSet::new(); + + for (line_no, raw_line) in content.lines().enumerate() { + let line = raw_line.trim(); + + // Try parsing subcommand, excluding `all` because it's not used in README + if let Ok((_, sub)) = parse_subcommand_line(line) { + state = if sub == "all" { + ParserState::Idle + } else { + ParserState::SeenSubcommand(sub) + }; + continue; + } + + // Try parsing metadata + if let Ok((_, (file, desc))) = parse_metadata_line(line) { + let subcommand = match state { + ParserState::SeenSubcommand(s) => s, + ParserState::Idle => { + return exec_err!( + "Metadata without preceding subcommand at {}:{}", + path.display(), + line_no + 1 + ); + } + }; + + if !seen_subcommands.insert(subcommand) { + return exec_err!("Duplicate metadata for subcommand `{subcommand}`"); + } + + entries.push(ExampleEntry { + subcommand: subcommand.to_string(), + file: file.to_string(), + desc: desc.to_string(), + }); + + state = ParserState::Idle; + continue; + } + + // If a non-blank doc line interrupts a pending subcommand, reset the state + if let ParserState::SeenSubcommand(_) = state + && is_non_blank_doc_line(line) + { + state = ParserState::Idle; + } + } + + Ok(entries) +} + +/// Returns `true` for non-blank Rust doc comment lines (`//!`). +/// +/// Used to detect when a subcommand is interrupted by unrelated documentation, +/// so metadata is only accepted immediately after a subcommand (blank doc lines +/// are allowed in between). +fn is_non_blank_doc_line(line: &str) -> bool { + line.starts_with("//!") && !line.trim_start_matches("//!").trim().is_empty() +} + +#[cfg(test)] +mod tests { + use super::*; + + use tempfile::TempDir; + + #[test] + fn parse_subcommand_line_accepts_valid_input() { + let line = "//! - `date_time`"; + let sub = parse_subcommand_line(line); + assert_eq!(sub, Ok(("", "date_time"))); + } + + #[test] + fn parse_subcommand_line_invalid_inputs() { + let err_lines = [ + "//! - ", + "//! - foo", + "//! - `foo` bar", + "//! --", + "//!-", + "//!--", + "//!", + "//", + "/", + "", + ]; + for line in err_lines { + assert!( + parse_subcommand_line(line).is_err(), + "expected error for input: {line}" + ); + } + } + + #[test] + fn parse_metadata_line_accepts_valid_input() { + let line = + "//! (file: date_time.rs, desc: Examples of date-time related functions)"; + let res = parse_metadata_line(line); + assert_eq!( + res, + Ok(( + "", + ("date_time.rs", "Examples of date-time related functions") + )) + ); + + let line = "//! (file: foo.rs, desc: Foo, bar, baz)"; + let res = parse_metadata_line(line); + assert_eq!(res, Ok(("", ("foo.rs", "Foo, bar, baz")))); + + let line = "//! (file: foo.rs, desc: Foo(FOO))"; + let res = parse_metadata_line(line); + assert_eq!(res, Ok(("", ("foo.rs", "Foo(FOO)")))); + } + + #[test] + fn parse_metadata_line_invalid_inputs() { + let bad_lines = [ + "//! (file: foo.rs)", + "//! (desc: missing file)", + "//! file: foo.rs, desc: test", + "//! file: foo.rs,desc: test", + "//! (file: foo.rs desc: test)", + "//! (file: foo.rs,desc: test)", + "//! (desc: test, file: foo.rs)", + "//! ()", + "//!
(file: foo.rs, desc: test) extra", + "", + ]; + for line in bad_lines { + assert!( + parse_metadata_line(line).is_err(), + "expected error for input: {line}" + ); + } + } + + #[test] + fn parse_main_rs_docs_extracts_entries() -> Result<()> { + let tmp = TempDir::new().unwrap(); + let main_rs = tmp.path().join("main.rs"); + + fs::write( + &main_rs, + r#" + //! - `foo` + //! (file: foo.rs, desc: first example) + //! + //! - `bar` + //! (file: bar.rs, desc: second example) + "#, + )?; + + let entries = parse_main_rs_docs(&main_rs)?; + + assert_eq!(entries.len(), 2); + + assert_eq!(entries[0].subcommand, "foo"); + assert_eq!(entries[0].file, "foo.rs"); + assert_eq!(entries[0].desc, "first example"); + + assert_eq!(entries[1].subcommand, "bar"); + assert_eq!(entries[1].file, "bar.rs"); + assert_eq!(entries[1].desc, "second example"); + Ok(()) + } +} diff --git a/datafusion-examples/src/utils/example_metadata/render.rs b/datafusion-examples/src/utils/example_metadata/render.rs new file mode 100644 index 0000000000000..a4ea620e78352 --- /dev/null +++ b/datafusion-examples/src/utils/example_metadata/render.rs @@ -0,0 +1,203 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Markdown renderer for DataFusion example documentation. +//! +//! This module takes parsed example metadata and generates the +//! `README.md` content for `datafusion-examples`, including group +//! sections and example tables. + +use std::path::PathBuf; + +use datafusion::error::{DataFusionError, Result}; + +use crate::utils::example_metadata::discover::discover_example_groups; +use crate::utils::example_metadata::model::ExampleGroup; +use crate::utils::example_metadata::{Category, RepoLayout}; + +const STATIC_HEADER: &str = r#" + +# DataFusion Examples + +This crate includes end to end, highly commented examples of how to use +various DataFusion APIs to help you get started. + +## Prerequisites + +Run `git submodule update --init` to init test files. + +## Running Examples + +To run an example, use the `cargo run` command, such as: + +```bash +git clone https://github.com/apache/datafusion +cd datafusion +# Download test data +git submodule update --init + +# Change to the examples directory +cd datafusion-examples/examples + +# Run all examples in a group +cargo run --example -- all + +# Run a specific example within a group +cargo run --example -- + +# Run all examples in the `dataframe` group +cargo run --example dataframe -- all + +# Run a single example from the `dataframe` group +# (apply the same pattern for any other group) +cargo run --example dataframe -- dataframe +``` +"#; + +/// Generates Markdown documentation for DataFusion examples. +/// +/// If `group` is `None`, documentation is generated for all example groups. 
+/// If `group` is `Some`, only that group is rendered. +/// +/// # Errors +/// +/// Returns an error if: +/// - the requested group does not exist +/// - a `main.rs` file is missing +/// - documentation comments are malformed +pub fn generate_examples_readme( + layout: &RepoLayout, + group: Option<&str>, +) -> Result<String> { + let examples_root = layout.examples_root(); + + let mut out = String::new(); + out.push_str(STATIC_HEADER); + + let group_dirs: Vec<PathBuf> = match group { + Some(name) => { + let dir = examples_root.join(name); + if !dir.is_dir() { + return Err(DataFusionError::Execution(format!( + "Example group `{name}` does not exist" + ))); + } + vec![dir] + } + None => discover_example_groups(&examples_root)?, + }; + + for group_dir in group_dirs { + let raw_name = + group_dir + .file_name() + .and_then(|s| s.to_str()) + .ok_or_else(|| { + DataFusionError::Execution("Invalid example group dir".to_string()) + })?; + + let category = Category::for_group(raw_name); + let group = ExampleGroup::from_dir(&group_dir, category)?; + + out.push_str(&group.render_markdown()); + } + + Ok(out) +} + +impl ExampleGroup { + /// Renders this example group as a Markdown section for the README. + pub fn render_markdown(&self) -> String { + let mut out = String::new(); + out.push_str(&format!("\n## {} Examples\n\n", self.name.title())); + out.push_str(&format!("### Group: `{}`\n\n", self.name.raw())); + out.push_str(&format!("#### Category: {}\n\n", self.category.name())); + out.push_str("| Subcommand | File Path | Description |\n"); + out.push_str("| --- | --- | --- |\n"); + + for example in &self.examples { + out.push_str(&format!( + "| {} | [`{}/{}`](examples/{}/{}) | {} |\n", + example.subcommand, + self.name.raw(), + example.file, + self.name.raw(), + example.file, + example.desc + )); + } + + out + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::utils::example_metadata::test_utils::assert_exec_err_contains; + + use std::fs; + + use tempfile::TempDir; + + #[test] + fn single_group_generation_works() { + let tmp = TempDir::new().unwrap(); + // Fake repo root + let layout = RepoLayout::from_root(tmp.path().to_path_buf()); + + // Create: datafusion-examples/examples/builtin_functions + let examples_dir = layout.example_group_dir("builtin_functions"); + fs::create_dir_all(&examples_dir).unwrap(); + + fs::write( + examples_dir.join("main.rs"), + "//! - `x`\n//! (file: foo.rs, desc: test)", + ) + .unwrap(); + + let out = generate_examples_readme(&layout, Some("builtin_functions")).unwrap(); + assert!(out.contains("Builtin Functions")); + assert!(out.contains("| x | [`builtin_functions/foo.rs`]")); + } + + #[test] + fn single_group_generation_fails_if_group_missing() { + let tmp = TempDir::new().unwrap(); + let layout = RepoLayout::from_root(tmp.path().to_path_buf()); + let err = generate_examples_readme(&layout, Some("missing_group")).unwrap_err(); + assert_exec_err_contains(err, "Example group `missing_group` does not exist"); + } +} diff --git a/datafusion-examples/src/utils/example_metadata/test_utils.rs b/datafusion-examples/src/utils/example_metadata/test_utils.rs new file mode 100644 index 0000000000000..d6ab3b06ba06d --- /dev/null +++ b/datafusion-examples/src/utils/example_metadata/test_utils.rs @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Test helpers for example metadata parsing and validation. +//! +//! This module provides small, focused utilities to reduce duplication +//! and keep tests readable across the example metadata submodules. + +use std::fs; + +use datafusion::error::{DataFusionError, Result}; +use tempfile::TempDir; + +use crate::utils::example_metadata::{Category, ExampleGroup}; + +/// Asserts that an `Execution` error contains the expected message fragment. +/// +/// Keeps tests focused on semantic error causes without coupling them +/// to full error string formatting. +pub fn assert_exec_err_contains(err: DataFusionError, needle: &str) { + match err { + DataFusionError::Execution(msg) => { + assert!( + msg.contains(needle), + "expected '{needle}' in error message, got: {msg}" + ); + } + other => panic!("expected Execution error, got: {other:?}"), + } +} + +/// Helper for grammar-focused tests. +/// +/// Creates a minimal temporary example group with a single `main.rs` +/// containing the provided docs. Intended for testing parsing and +/// validation rules, not full integration behavior. +pub fn example_group_from_docs(docs: &str) -> Result<ExampleGroup> { + let tmp = TempDir::new().map_err(|e| { + DataFusionError::Execution(format!("Failed initializing temp dir: {e}")) + })?; + let dir = tmp.path().join("group"); + fs::create_dir(&dir).map_err(|e| { + DataFusionError::Execution(format!("Failed creating temp dir: {e}")) + })?; + fs::write(dir.join("main.rs"), docs).map_err(|e| { + DataFusionError::Execution(format!("Failed writing to temp file: {e}")) + })?; + ExampleGroup::from_dir(&dir, Category::SingleProcess) +} diff --git a/datafusion-examples/src/utils/mod.rs b/datafusion-examples/src/utils/mod.rs new file mode 100644 index 0000000000000..da96724a49cb3 --- /dev/null +++ b/datafusion-examples/src/utils/mod.rs @@ -0,0 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
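The pieces above (layout detection, parsing, rendering, test helpers) are meant to be driven by the `examples-docs` binary mentioned in the module docs. That binary is not part of this hunk; the following is only a minimal sketch of how such a driver could wire the public API together, assuming the `datafusion_examples::utils::example_metadata` re-exports shown above:

```rust
// Hypothetical driver sketch -- the real examples-docs binary is not shown
// in this diff. Only the re-exported API above is assumed.
use datafusion::error::Result;
use datafusion_examples::utils::example_metadata::{
    generate_examples_readme, RepoLayout,
};

fn main() -> Result<()> {
    // Optional group filter, e.g. `examples-docs dataframe`
    let group = std::env::args().nth(1);
    // Repository root derived from CARGO_MANIFEST_DIR
    let layout = RepoLayout::detect()?;
    // Render one group, or the full README body when no group is given
    let markdown = generate_examples_readme(&layout, group.as_deref())?;
    print!("{markdown}");
    Ok(())
}
```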
+ +mod csv_to_parquet; +pub mod datasets; +pub mod example_metadata; + +pub use csv_to_parquet::write_csv_to_parquet; diff --git a/datafusion/catalog-listing/Cargo.toml b/datafusion/catalog-listing/Cargo.toml index 4eaeed675a206..61b55397137df 100644 --- a/datafusion/catalog-listing/Cargo.toml +++ b/datafusion/catalog-listing/Cargo.toml @@ -46,11 +46,14 @@ futures = { workspace = true } itertools = { workspace = true } log = { workspace = true } object_store = { workspace = true } -tokio = { workspace = true } [dev-dependencies] +chrono = { workspace = true } datafusion-datasource-parquet = { workspace = true } +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/catalog-listing/src/config.rs b/datafusion/catalog-listing/src/config.rs index 3370d2ea75535..ca4d2abfcd737 100644 --- a/datafusion/catalog-listing/src/config.rs +++ b/datafusion/catalog-listing/src/config.rs @@ -19,9 +19,10 @@ use crate::options::ListingOptions; use arrow::datatypes::{DataType, Schema, SchemaRef}; use datafusion_catalog::Session; use datafusion_common::{config_err, internal_err}; +use datafusion_datasource::ListingTableUrl; use datafusion_datasource::file_compression_type::FileCompressionType; +#[expect(deprecated)] use datafusion_datasource::schema_adapter::SchemaAdapterFactory; -use datafusion_datasource::ListingTableUrl; use datafusion_physical_expr_adapter::PhysicalExprAdapterFactory; use std::str::FromStr; use std::sync::Arc; @@ -44,15 +45,12 @@ pub enum SchemaSource { /// # Schema Evolution Support /// /// This configuration supports schema evolution through the optional -/// [`SchemaAdapterFactory`]. You might want to override the default factory when you need: +/// [`PhysicalExprAdapterFactory`]. You might want to override the default factory when you need: /// /// - **Type coercion requirements**: When you need custom logic for converting between /// different Arrow data types (e.g., Int32 ↔ Int64, Utf8 ↔ LargeUtf8) /// - **Column mapping**: You need to map columns with a legacy name to a new name /// - **Custom handling of missing columns**: By default they are filled in with nulls, but you may e.g. want to fill them in with `0` or `""`. -/// -/// If not specified, a [`datafusion_datasource::schema_adapter::DefaultSchemaAdapterFactory`] -/// will be used, which handles basic schema compatibility cases. #[derive(Debug, Clone, Default)] pub struct ListingTableConfig { /// Paths on the `ObjectStore` for creating [`crate::ListingTable`]. 
@@ -68,8 +66,6 @@ pub struct ListingTableConfig { pub options: Option<ListingOptions>, /// Tracks the source of the schema information pub(crate) schema_source: SchemaSource, - /// Optional [`SchemaAdapterFactory`] for creating schema adapters - pub(crate) schema_adapter_factory: Option<Arc<dyn SchemaAdapterFactory>>, /// Optional [`PhysicalExprAdapterFactory`] for creating physical expression adapters pub(crate) expr_adapter_factory: Option<Arc<dyn PhysicalExprAdapterFactory>>, } @@ -218,8 +214,7 @@ impl ListingTableConfig { file_schema, options: _, schema_source, - schema_adapter_factory, - expr_adapter_factory: physical_expr_adapter_factory, + expr_adapter_factory, } = self; let (schema, new_schema_source) = match file_schema { @@ -241,8 +236,7 @@ impl ListingTableConfig { file_schema: Some(schema), options: Some(options), schema_source: new_schema_source, - schema_adapter_factory, - expr_adapter_factory: physical_expr_adapter_factory, + expr_adapter_factory, }) } None => internal_err!("No `ListingOptions` set for inferring schema"), @@ -282,7 +276,6 @@ impl ListingTableConfig { file_schema: self.file_schema, options: Some(options), schema_source: self.schema_source, - schema_adapter_factory: self.schema_adapter_factory, expr_adapter_factory: self.expr_adapter_factory, }) } @@ -290,63 +283,11 @@ } } - /// Set the [`SchemaAdapterFactory`] for the [`crate::ListingTable`] - /// - /// The schema adapter factory is used to create schema adapters that can - /// handle schema evolution and type conversions when reading files with - /// different schemas than the table schema. - /// - /// If not provided, a default schema adapter factory will be used. - /// - /// # Example: Custom Schema Adapter for Type Coercion - /// ```rust - /// # use std::sync::Arc; - /// # use datafusion_catalog_listing::{ListingTableConfig, ListingOptions}; - /// # use datafusion_datasource::schema_adapter::{SchemaAdapterFactory, SchemaAdapter}; - /// # use datafusion_datasource::ListingTableUrl; - /// # use datafusion_datasource_parquet::file_format::ParquetFormat; - /// # use arrow::datatypes::{SchemaRef, Schema, Field, DataType}; - /// # - /// # #[derive(Debug)] - /// # struct MySchemaAdapterFactory; - /// # impl SchemaAdapterFactory for MySchemaAdapterFactory { - /// # fn create(&self, _projected_table_schema: SchemaRef, _file_schema: SchemaRef) -> Box<dyn SchemaAdapter> { - /// # unimplemented!() - /// # } - /// # } - /// # let table_paths = ListingTableUrl::parse("file:///path/to/data").unwrap(); - /// # let listing_options = ListingOptions::new(Arc::new(ParquetFormat::default())); - /// # let table_schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)])); - /// let config = ListingTableConfig::new(table_paths) - /// .with_listing_options(listing_options) - /// .with_schema(table_schema) - /// .with_schema_adapter_factory(Arc::new(MySchemaAdapterFactory)); - /// ``` - pub fn with_schema_adapter_factory( - self, - schema_adapter_factory: Arc<dyn SchemaAdapterFactory>, - ) -> Self { - Self { - schema_adapter_factory: Some(schema_adapter_factory), - ..self - } - } - - /// Get the [`SchemaAdapterFactory`] for this configuration - pub fn schema_adapter_factory(&self) -> Option<&Arc<dyn SchemaAdapterFactory>> { - self.schema_adapter_factory.as_ref() - } - /// Set the [`PhysicalExprAdapterFactory`] for the [`crate::ListingTable`] /// /// The expression adapter factory is used to create physical expression adapters that can /// handle schema evolution and type conversions when evaluating expressions /// with different schemas than the table schema.
- /// - /// If not provided, a default physical expression adapter factory will be used unless a custom - /// `SchemaAdapterFactory` is set, in which case only the `SchemaAdapterFactory` will be used. - /// - /// See for details on this transition. pub fn with_expr_adapter_factory( self, expr_adapter_factory: Arc<dyn PhysicalExprAdapterFactory>, @@ -356,4 +297,23 @@ ..self } } + + /// Deprecated: Set the [`SchemaAdapterFactory`] for the [`crate::ListingTable`] + /// + /// `SchemaAdapterFactory` has been removed. Use [`Self::with_expr_adapter_factory`] + /// and `PhysicalExprAdapterFactory` instead. See `upgrading.md` for more details. + /// + /// This method is a no-op and returns `self` unchanged. + #[deprecated( + since = "52.0.0", + note = "SchemaAdapterFactory has been removed. Use with_expr_adapter_factory and PhysicalExprAdapterFactory instead. See upgrading.md for more details." + )] + #[expect(deprecated)] + pub fn with_schema_adapter_factory( + self, + _schema_adapter_factory: Arc<dyn SchemaAdapterFactory>, + ) -> Self { + // No-op - just return self unchanged + self + } } diff --git a/datafusion/catalog-listing/src/helpers.rs b/datafusion/catalog-listing/src/helpers.rs index 82cc36867939e..c6305c30008ce 100644 --- a/datafusion/catalog-listing/src/helpers.rs +++ b/datafusion/catalog-listing/src/helpers.rs @@ -21,25 +21,23 @@ use std::mem; use std::sync::Arc; use datafusion_catalog::Session; -use datafusion_common::internal_err; -use datafusion_common::{HashMap, Result, ScalarValue}; +use datafusion_common::{HashMap, Result, ScalarValue, assert_or_internal_err}; use datafusion_datasource::ListingTableUrl; use datafusion_datasource::PartitionedFile; -use datafusion_expr::{BinaryExpr, Operator}; +use datafusion_expr::{BinaryExpr, Operator, lit, utils}; use arrow::{ - array::{Array, ArrayRef, AsArray, StringBuilder}, - compute::{and, cast, prep_null_mask_filter}, - datatypes::{DataType, Field, Fields, Schema}, + array::AsArray, + datatypes::{DataType, Field}, record_batch::RecordBatch, }; use datafusion_expr::execution_props::ExecutionProps; use futures::stream::FuturesUnordered; -use futures::{stream::BoxStream, StreamExt, TryStreamExt}; +use futures::{StreamExt, TryStreamExt, stream::BoxStream}; use log::{debug, trace}; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; -use datafusion_common::{Column, DFSchema, DataFusionError}; +use datafusion_common::{Column, DFSchema}; use datafusion_expr::{Expr, Volatility}; use datafusion_physical_expr::create_physical_expr; use object_store::path::Path; @@ -53,7 +51,7 @@ use object_store::{ObjectMeta, ObjectStore}; pub fn expr_applicable_for_cols(col_names: &[&str], expr: &Expr) -> bool { let mut is_applicable = true; expr.apply(|expr| match expr { - Expr::Column(Column { ref name, .. }) => { + Expr::Column(Column { name, ..
}) => { is_applicable &= col_names.contains(&name.as_str()); if is_applicable { Ok(TreeNodeRecursion::Jump) @@ -85,13 +83,28 @@ pub fn expr_applicable_for_cols(col_names: &[&str], expr: &Expr) -> bool { | Expr::Exists(_) | Expr::InSubquery(_) | Expr::ScalarSubquery(_) + | Expr::SetComparison(_) | Expr::GroupingSet(_) - | Expr::Case(_) => Ok(TreeNodeRecursion::Continue), + | Expr::Case(_) + | Expr::Lambda(_) + | Expr::LambdaVariable(_) => Ok(TreeNodeRecursion::Continue), Expr::ScalarFunction(scalar_function) => { match scalar_function.func.signature().volatility { Volatility::Immutable => Ok(TreeNodeRecursion::Continue), // TODO: Stable functions could be `applicable`, but that would require access to the context + // https://github.com/apache/datafusion/issues/21690 + Volatility::Stable | Volatility::Volatile => { + is_applicable = false; + Ok(TreeNodeRecursion::Stop) + } + } + } + Expr::HigherOrderFunction(hof) => { + match hof.func.signature().volatility { + Volatility::Immutable => Ok(TreeNodeRecursion::Continue), + // TODO: Stable functions could be `applicable`, but that would require access to the context + // https://github.com/apache/datafusion/issues/21690 Volatility::Stable | Volatility::Volatile => { is_applicable = false; Ok(TreeNodeRecursion::Stop) @@ -103,6 +116,7 @@ pub fn expr_applicable_for_cols(col_names: &[&str], expr: &Expr) -> bool { // - AGGREGATE and WINDOW should not end up in filter conditions, except maybe in some edge cases // - Can `Wildcard` be considered as a `Literal`? // - ScalarVariable could be `applicable`, but that would require access to the context + // https://github.com/apache/datafusion/issues/21690 // TODO: remove the next line after `Expr::Wildcard` is removed #[expect(deprecated)] Expr::AggregateFunction { .. 
} @@ -239,105 +253,6 @@ pub async fn list_partitions( Ok(out) } -async fn prune_partitions( - table_path: &ListingTableUrl, - partitions: Vec<Partition>, - filters: &[Expr], - partition_cols: &[(String, DataType)], -) -> Result<Vec<Partition>> { - if filters.is_empty() { - // prune partitions which don't contain the partition columns - return Ok(partitions - .into_iter() - .filter(|p| { - let cols = partition_cols.iter().map(|x| x.0.as_str()); - !parse_partitions_for_path(table_path, &p.path, cols) - .unwrap_or_default() - .is_empty() - }) - .collect()); - } - - let mut builders: Vec<_> = (0..partition_cols.len()) - .map(|_| StringBuilder::with_capacity(partitions.len(), partitions.len() * 10)) - .collect(); - - for partition in &partitions { - let cols = partition_cols.iter().map(|x| x.0.as_str()); - let parsed = parse_partitions_for_path(table_path, &partition.path, cols) - .unwrap_or_default(); - - let mut builders = builders.iter_mut(); - for (p, b) in parsed.iter().zip(&mut builders) { - b.append_value(p); - } - builders.for_each(|b| b.append_null()); - } - - let arrays = partition_cols - .iter() - .zip(builders) - .map(|((_, d), mut builder)| { - let array = builder.finish(); - cast(&array, d) - }) - .collect::<Result<_, _>>()?; - - let fields: Fields = partition_cols - .iter() - .map(|(n, d)| Field::new(n, d.clone(), true)) - .collect(); - let schema = Arc::new(Schema::new(fields)); - - let df_schema = DFSchema::from_unqualified_fields( - partition_cols - .iter() - .map(|(n, d)| Field::new(n, d.clone(), true)) - .collect(), - Default::default(), - )?; - - let batch = RecordBatch::try_new(schema, arrays)?; - - // TODO: Plumb this down - let props = ExecutionProps::new(); - - // Applies `filter` to `batch` returning `None` on error - let do_filter = |filter| -> Result<ArrayRef> { - let expr = create_physical_expr(filter, &df_schema, &props)?; - expr.evaluate(&batch)?.into_array(partitions.len()) - }; - - //.Compute the conjunction of the filters - let mask = filters - .iter() - .map(|f| do_filter(f).map(|a| a.as_boolean().clone())) - .reduce(|a, b| Ok(and(&a?, &b?)?)); - - let mask = match mask { - Some(Ok(mask)) => mask, - Some(Err(err)) => return Err(err), - None => return Ok(partitions), - }; - - // Don't retain partitions that evaluated to null - let prepared = match mask.null_count() { - 0 => mask, - _ => prep_null_mask_filter(&mask), - }; - - // Sanity check - assert_eq!(prepared.len(), partitions.len()); - - let filtered = partitions - .into_iter() - .zip(prepared.values()) - .filter_map(|(p, f)| f.then_some(p)) - .collect(); - - Ok(filtered) -} - #[derive(Debug)] enum PartitionValue { Single(String), @@ -348,16 +263,11 @@ fn populate_partition_values<'a>( partition_values: &mut HashMap<&'a str, PartitionValue>, filter: &'a Expr, ) { - if let Expr::BinaryExpr(BinaryExpr { - ref left, - op, - ref right, - }) = filter - { + if let Expr::BinaryExpr(BinaryExpr { left, op, right }) = filter { match op { Operator::Eq => match (left.as_ref(), right.as_ref()) { - (Expr::Column(Column { ref name, .. }), Expr::Literal(val, _)) - | (Expr::Literal(val, _), Expr::Column(Column { ref name, .. })) => { + (Expr::Column(Column { name, .. }), Expr::Literal(val, _)) + | (Expr::Literal(val, _), Expr::Column(Column { name, ..
})) => { if partition_values .insert(name, PartitionValue::Single(val.to_string())) .is_some() @@ -412,6 +322,70 @@ pub fn evaluate_partition_prefix<'a>( } } +fn filter_partitions( + pf: PartitionedFile, + filters: &[Expr], + df_schema: &DFSchema, +) -> Result<Option<PartitionedFile>> { + if pf.partition_values.is_empty() && !filters.is_empty() { + return Ok(None); + } else if filters.is_empty() { + return Ok(Some(pf)); + } + + let arrays = pf + .partition_values + .iter() + .map(|v| v.to_array()) + .collect::<Result<_, _>>()?; + + let batch = RecordBatch::try_new(Arc::clone(df_schema.inner()), arrays)?; + + let filter = utils::conjunction(filters.iter().cloned()).unwrap_or_else(|| lit(true)); + let props = ExecutionProps::new(); + let expr = create_physical_expr(&filter, df_schema, &props)?; + + // Since we're only operating on a single file, our batch and resulting "array" holds only one + // value indicating if the input file matches the provided filters + let matches = expr.evaluate(&batch)?.into_array(1)?; + if matches.as_boolean().value(0) { + return Ok(Some(pf)); + } + + Ok(None) +} + +/// Returns `Ok(None)` when the file is not inside a valid partition path +/// (e.g. a stale file in the table root directory). Such files are skipped +/// because hive-style partition values are never null and there is no valid +/// value to assign for non-partitioned files. +fn try_into_partitioned_file( + object_meta: ObjectMeta, + partition_cols: &[(String, DataType)], + table_path: &ListingTableUrl, +) -> Result<Option<PartitionedFile>> { + let cols = partition_cols.iter().map(|(name, _)| name.as_str()); + let parsed = parse_partitions_for_path(table_path, &object_meta.location, cols); + + let Some(parsed) = parsed else { + // parse_partitions_for_path already logs a debug message + return Ok(None); + }; + + let partition_values = parsed + .into_iter() + .zip(partition_cols) + .map(|(parsed, (_, datatype))| { + ScalarValue::try_from_string(parsed.to_string(), datatype) + }) + .collect::<Result<Vec<_>>>()?; + + let mut pf: PartitionedFile = object_meta.into(); + pf.partition_values = partition_values; + + Ok(Some(pf)) +} + /// Discover the partitions on the given path and prune out files /// that belong to irrelevant partitions using `filters` expressions. /// `filters` should only contain expressions that can be evaluated @@ -424,80 +398,48 @@ pub async fn pruned_partition_list<'a>( file_extension: &'a str, partition_cols: &'a [(String, DataType)], ) -> Result<BoxStream<'a, Result<PartitionedFile>>> { - // if no partition col => simply list all the files - if partition_cols.is_empty() { - if !filters.is_empty() { - return internal_err!( - "Got partition filters for unpartitioned table {}", - table_path - ); - } - return Ok(Box::pin( - table_path - .list_all_files(ctx, store, file_extension) - .await? - .try_filter(|object_meta| futures::future::ready(object_meta.size > 0)) - .map_ok(|object_meta| object_meta.into()), - )); - } - - let partition_prefix = evaluate_partition_prefix(partition_cols, filters); - - let partitions = - list_partitions(store, table_path, partition_cols.len(), partition_prefix) - .await?; - debug!("Listed {} partitions", partitions.len()); + let prefix = if !partition_cols.is_empty() { + evaluate_partition_prefix(partition_cols, filters) + } else { + None + }; - let pruned = - prune_partitions(table_path, partitions, filters, partition_cols).await?; + let objects = table_path + .list_prefixed_files(ctx, store, prefix, file_extension) + .await?
+ .try_filter(|object_meta| futures::future::ready(object_meta.size > 0)); - debug!("Pruning yielded {} partitions", pruned.len()); + if partition_cols.is_empty() { + assert_or_internal_err!( + filters.is_empty(), + "Got partition filters for unpartitioned table {}", + table_path + ); - let stream = futures::stream::iter(pruned) - .map(move |partition: Partition| async move { - let cols = partition_cols.iter().map(|x| x.0.as_str()); - let parsed = parse_partitions_for_path(table_path, &partition.path, cols); + // if no partition col => simply list all the files + Ok(objects.map_ok(|object_meta| object_meta.into()).boxed()) + } else { + let df_schema = DFSchema::from_unqualified_fields( + partition_cols + .iter() + .map(|(n, d)| Field::new(n, d.clone(), true)) + .collect(), + Default::default(), + )?; - let partition_values = parsed - .into_iter() - .flatten() - .zip(partition_cols) - .map(|(parsed, (_, datatype))| { - ScalarValue::try_from_string(parsed.to_string(), datatype) - }) - .collect::<Result<Vec<_>>>()?; - - let files = match partition.files { - Some(files) => files, - None => { - trace!("Recursively listing partition {}", partition.path); - store.list(Some(&partition.path)).try_collect().await? - } - }; - let files = files.into_iter().filter(move |o| { - let extension_match = o.location.as_ref().ends_with(file_extension); - // here need to scan subdirectories(`listing_table_ignore_subdirectory` = false) - let glob_match = table_path.contains(&o.location, false); - extension_match && glob_match - }); - - let stream = futures::stream::iter(files.map(move |object_meta| { - Ok(PartitionedFile { + Ok(objects + .try_filter_map(|object_meta| { + futures::future::ready(try_into_partitioned_file( object_meta, - partition_values: partition_values.clone(), - range: None, - statistics: None, - extensions: None, - metadata_size_hint: None, - }) - })); - - Ok::<_, DataFusionError>(stream) - }) - .buffer_unordered(CONCURRENCY_LIMIT) - .try_flatten() - .boxed(); - Ok(stream) + partition_cols, + table_path, + )) + }) + .try_filter_map(move |pf| { + futures::future::ready(filter_partitions(pf, filters, &df_schema)) + }) + .boxed()) + } } /// Extract the partition values for the given `file_path` (in the given `table_path`) @@ -541,22 +483,11 @@ pub fn describe_partition(partition: &Partition) -> (&str, usize, Vec<&str>) { #[cfg(test)] mod tests { - use async_trait::async_trait; - use datafusion_common::config::TableOptions; use datafusion_datasource::file_groups::FileGroup; - use datafusion_execution::config::SessionConfig; - use datafusion_execution::runtime_env::RuntimeEnv; - use futures::FutureExt; - use object_store::memory::InMemory; - use std::any::Any; use std::ops::Not; use super::*; - use datafusion_expr::{ - case, col, lit, AggregateUDF, Expr, LogicalPlan, ScalarUDF, WindowUDF, - }; - use datafusion_physical_expr_common::physical_expr::PhysicalExpr; - use datafusion_physical_plan::ExecutionPlan; + use datafusion_expr::{case, col}; #[test] fn test_split_files() { @@ -599,209 +530,6 @@ assert_eq!(0, chunks.len()); } - #[tokio::test] - async fn test_pruned_partition_list_empty() { - let (store, state) = make_test_store_and_state(&[ - ("tablepath/mypartition=val1/notparquetfile", 100), - ("tablepath/mypartition=val1/ignoresemptyfile.parquet", 0), - ("tablepath/file.parquet", 100), - ("tablepath/notapartition/file.parquet", 100), - ("tablepath/notmypartition=val1/file.parquet", 100), - ]); - let filter = Expr::eq(col("mypartition"), lit("val1")); - let pruned = pruned_partition_list(
state.as_ref(), - store.as_ref(), - &ListingTableUrl::parse("file:///tablepath/").unwrap(), - &[filter], - ".parquet", - &[(String::from("mypartition"), DataType::Utf8)], - ) - .await - .expect("partition pruning failed") - .collect::<Vec<_>>() - .await; - - assert_eq!(pruned.len(), 0); - } - - #[tokio::test] - async fn test_pruned_partition_list() { - let (store, state) = make_test_store_and_state(&[ - ("tablepath/mypartition=val1/file.parquet", 100), - ("tablepath/mypartition=val2/file.parquet", 100), - ("tablepath/mypartition=val1/ignoresemptyfile.parquet", 0), - ("tablepath/mypartition=val1/other=val3/file.parquet", 100), - ("tablepath/notapartition/file.parquet", 100), - ("tablepath/notmypartition=val1/file.parquet", 100), - ]); - let filter = Expr::eq(col("mypartition"), lit("val1")); - let pruned = pruned_partition_list( - state.as_ref(), - store.as_ref(), - &ListingTableUrl::parse("file:///tablepath/").unwrap(), - &[filter], - ".parquet", - &[(String::from("mypartition"), DataType::Utf8)], - ) - .await - .expect("partition pruning failed") - .try_collect::<Vec<_>>() - .await - .unwrap(); - - assert_eq!(pruned.len(), 2); - let f1 = &pruned[0]; - assert_eq!( - f1.object_meta.location.as_ref(), - "tablepath/mypartition=val1/file.parquet" - ); - assert_eq!(&f1.partition_values, &[ScalarValue::from("val1")]); - let f2 = &pruned[1]; - assert_eq!( - f2.object_meta.location.as_ref(), - "tablepath/mypartition=val1/other=val3/file.parquet" - ); - assert_eq!(f2.partition_values, &[ScalarValue::from("val1"),]); - } - - #[tokio::test] - async fn test_pruned_partition_list_multi() { - let (store, state) = make_test_store_and_state(&[ - ("tablepath/part1=p1v1/file.parquet", 100), - ("tablepath/part1=p1v2/part2=p2v1/file1.parquet", 100), - ("tablepath/part1=p1v2/part2=p2v1/file2.parquet", 100), - ("tablepath/part1=p1v3/part2=p2v1/file2.parquet", 100), - ("tablepath/part1=p1v2/part2=p2v2/file2.parquet", 100), - ]); - let filter1 = Expr::eq(col("part1"), lit("p1v2")); - let filter2 = Expr::eq(col("part2"), lit("p2v1")); - let pruned = pruned_partition_list( - state.as_ref(), - store.as_ref(), - &ListingTableUrl::parse("file:///tablepath/").unwrap(), - &[filter1, filter2], - ".parquet", - &[ - (String::from("part1"), DataType::Utf8), - (String::from("part2"), DataType::Utf8), - ], - ) - .await - .expect("partition pruning failed") - .try_collect::<Vec<_>>() - .await - .unwrap(); - - assert_eq!(pruned.len(), 2); - let f1 = &pruned[0]; - assert_eq!( - f1.object_meta.location.as_ref(), - "tablepath/part1=p1v2/part2=p2v1/file1.parquet" - ); - assert_eq!( - &f1.partition_values, - &[ScalarValue::from("p1v2"), ScalarValue::from("p2v1"),] - ); - let f2 = &pruned[1]; - assert_eq!( - f2.object_meta.location.as_ref(), - "tablepath/part1=p1v2/part2=p2v1/file2.parquet" - ); - assert_eq!( - &f2.partition_values, - &[ScalarValue::from("p1v2"), ScalarValue::from("p2v1")] - ); - } - - #[tokio::test] - async fn test_list_partition() { - let (store, _) = make_test_store_and_state(&[ - ("tablepath/part1=p1v1/file.parquet", 100), - ("tablepath/part1=p1v2/part2=p2v1/file1.parquet", 100), - ("tablepath/part1=p1v2/part2=p2v1/file2.parquet", 100), - ("tablepath/part1=p1v3/part2=p2v1/file3.parquet", 100), - ("tablepath/part1=p1v2/part2=p2v2/file4.parquet", 100), - ("tablepath/part1=p1v2/part2=p2v2/empty.parquet", 0), - ]); - - let partitions = list_partitions( - store.as_ref(), - &ListingTableUrl::parse("file:///tablepath/").unwrap(), - 0, - None, - ) - .await - .expect("listing partitions failed"); - - assert_eq!( - &partitions - .iter()
.map(describe_partition) - .collect::<Vec<_>>(), - &vec![ - ("tablepath", 0, vec![]), - ("tablepath/part1=p1v1", 1, vec![]), - ("tablepath/part1=p1v2", 1, vec![]), - ("tablepath/part1=p1v3", 1, vec![]), - ] - ); - - let partitions = list_partitions( - store.as_ref(), - &ListingTableUrl::parse("file:///tablepath/").unwrap(), - 1, - None, - ) - .await - .expect("listing partitions failed"); - - assert_eq!( - &partitions - .iter() - .map(describe_partition) - .collect::<Vec<_>>(), - &vec![ - ("tablepath", 0, vec![]), - ("tablepath/part1=p1v1", 1, vec!["file.parquet"]), - ("tablepath/part1=p1v2", 1, vec![]), - ("tablepath/part1=p1v2/part2=p2v1", 2, vec![]), - ("tablepath/part1=p1v2/part2=p2v2", 2, vec![]), - ("tablepath/part1=p1v3", 1, vec![]), - ("tablepath/part1=p1v3/part2=p2v1", 2, vec![]), - ] - ); - - let partitions = list_partitions( - store.as_ref(), - &ListingTableUrl::parse("file:///tablepath/").unwrap(), - 2, - None, - ) - .await - .expect("listing partitions failed"); - - assert_eq!( - &partitions - .iter() - .map(describe_partition) - .collect::<Vec<_>>(), - &vec![ - ("tablepath", 0, vec![]), - ("tablepath/part1=p1v1", 1, vec!["file.parquet"]), - ("tablepath/part1=p1v2", 1, vec![]), - ("tablepath/part1=p1v3", 1, vec![]), - ( - "tablepath/part1=p1v2/part2=p2v1", - 2, - vec!["file1.parquet", "file2.parquet"] - ), - ("tablepath/part1=p1v2/part2=p2v2", 2, vec!["file4.parquet"]), - ("tablepath/part1=p1v3/part2=p2v1", 2, vec!["file3.parquet"]), - ] - ); - } - #[test] fn test_parse_partitions_for_path() { assert_eq!( @@ -871,6 +599,130 @@ ); } + #[test] + fn test_try_into_partitioned_file_valid_partition() { + let table_path = ListingTableUrl::parse("file:///bucket/mytable").unwrap(); + let partition_cols = vec![("year_month".to_string(), DataType::Utf8)]; + let meta = ObjectMeta { + location: Path::from("bucket/mytable/year_month=2024-01/data.parquet"), + last_modified: chrono::Utc::now(), + size: 100, + e_tag: None, + version: None, + }; + + let result = + try_into_partitioned_file(meta, &partition_cols, &table_path).unwrap(); + assert!(result.is_some()); + let pf = result.unwrap(); + assert_eq!(pf.partition_values.len(), 1); + assert_eq!( + pf.partition_values[0], + ScalarValue::Utf8(Some("2024-01".to_string())) + ); + } + + #[test] + fn test_try_into_partitioned_file_root_file_skipped() { + // File in root directory (not inside any partition path) should be + // skipped — this is the case where a stale file exists from before + // hive partitioning was added.
+ let table_path = ListingTableUrl::parse("file:///bucket/mytable").unwrap(); + let partition_cols = vec![("year_month".to_string(), DataType::Utf8)]; + let meta = ObjectMeta { + location: Path::from("bucket/mytable/data.parquet"), + last_modified: chrono::Utc::now(), + size: 100, + e_tag: None, + version: None, + }; + + let result = + try_into_partitioned_file(meta, &partition_cols, &table_path).unwrap(); + assert!( + result.is_none(), + "Files outside partition structure should be skipped" + ); + } + + #[test] + fn test_try_into_partitioned_file_wrong_partition_name() { + // File in a directory that doesn't match the expected partition column + let table_path = ListingTableUrl::parse("file:///bucket/mytable").unwrap(); + let partition_cols = vec![("year_month".to_string(), DataType::Utf8)]; + let meta = ObjectMeta { + location: Path::from("bucket/mytable/wrong_col=2024-01/data.parquet"), + last_modified: chrono::Utc::now(), + size: 100, + e_tag: None, + version: None, + }; + + let result = + try_into_partitioned_file(meta, &partition_cols, &table_path).unwrap(); + assert!( + result.is_none(), + "Files with wrong partition column name should be skipped" + ); + } + + #[test] + fn test_try_into_partitioned_file_multiple_partitions() { + let table_path = ListingTableUrl::parse("file:///bucket/mytable").unwrap(); + let partition_cols = vec![ + ("year".to_string(), DataType::Utf8), + ("month".to_string(), DataType::Utf8), + ]; + let meta = ObjectMeta { + location: Path::from("bucket/mytable/year=2024/month=01/data.parquet"), + last_modified: chrono::Utc::now(), + size: 100, + e_tag: None, + version: None, + }; + + let result = + try_into_partitioned_file(meta, &partition_cols, &table_path).unwrap(); + assert!(result.is_some()); + let pf = result.unwrap(); + assert_eq!(pf.partition_values.len(), 2); + assert_eq!( + pf.partition_values[0], + ScalarValue::Utf8(Some("2024".to_string())) + ); + assert_eq!( + pf.partition_values[1], + ScalarValue::Utf8(Some("01".to_string())) + ); + } + + #[test] + fn test_try_into_partitioned_file_partial_partition_skipped() { + // File has first partition but not second — should be skipped + let table_path = ListingTableUrl::parse("file:///bucket/mytable").unwrap(); + let partition_cols = vec![ + ("year".to_string(), DataType::Utf8), + ("month".to_string(), DataType::Utf8), + ]; + let meta = ObjectMeta { + location: Path::from("bucket/mytable/year=2024/data.parquet"), + last_modified: chrono::Utc::now(), + size: 100, + e_tag: None, + version: None, + }; + + let result = + try_into_partitioned_file(meta, &partition_cols, &table_path).unwrap(); + // File has year=2024 but no month= directory — parse_partitions_for_path + // returns None because the path component "data.parquet" doesn't match + // the expected "month=..." pattern. 
+ assert!( + result.is_none(), + "Files with incomplete partition structure should be skipped" + ); + } + #[test] fn test_expr_applicable_for_cols() { assert!(expr_applicable_for_cols( @@ -1016,86 +868,4 @@ Some(Path::from("a=1970-01-05")), ); } - - pub fn make_test_store_and_state( - files: &[(&str, u64)], - ) -> (Arc<dyn ObjectStore>, Arc<dyn Session>) { - let memory = InMemory::new(); - - for (name, size) in files { - memory - .put(&Path::from(*name), vec![0; *size as usize].into()) - .now_or_never() - .unwrap() - .unwrap(); - } - - (Arc::new(memory), Arc::new(MockSession {})) - } - - struct MockSession {} - - #[async_trait] - impl Session for MockSession { - fn session_id(&self) -> &str { - unimplemented!() - } - - fn config(&self) -> &SessionConfig { - unimplemented!() - } - - async fn create_physical_plan( - &self, - _logical_plan: &LogicalPlan, - ) -> Result<Arc<dyn ExecutionPlan>> { - unimplemented!() - } - - fn create_physical_expr( - &self, - _expr: Expr, - _df_schema: &DFSchema, - ) -> Result<Arc<dyn PhysicalExpr>> { - unimplemented!() - } - - fn scalar_functions(&self) -> &std::collections::HashMap<String, Arc<ScalarUDF>> { - unimplemented!() - } - - fn aggregate_functions( - &self, - ) -> &std::collections::HashMap<String, Arc<AggregateUDF>> { - unimplemented!() - } - - fn window_functions(&self) -> &std::collections::HashMap<String, Arc<WindowUDF>> { - unimplemented!() - } - - fn runtime_env(&self) -> &Arc<RuntimeEnv> { - unimplemented!() - } - - fn execution_props(&self) -> &ExecutionProps { - unimplemented!() - } - - fn as_any(&self) -> &dyn Any { - unimplemented!() - } - - fn table_options(&self) -> &TableOptions { - unimplemented!() - } - - fn table_options_mut(&mut self) -> &mut TableOptions { - unimplemented!() - } - - fn task_ctx(&self) -> Arc<TaskContext> { - unimplemented!() - } - } } diff --git a/datafusion/catalog-listing/src/mod.rs b/datafusion/catalog-listing/src/mod.rs index 90d04b46b8067..9efb5aa96267e 100644 --- a/datafusion/catalog-listing/src/mod.rs +++ b/datafusion/catalog-listing/src/mod.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License.
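To make the helpers.rs refactor above concrete: instead of building per-partition string arrays and filtering them in bulk, each listed object is now mapped to a `PartitionedFile` from its hive path and then kept or dropped by evaluating the pushed-down filters against a one-row batch. The following is a standalone sketch of that per-file evaluation, simplified from `filter_partitions`; the single `year` column and the hard-coded filter are assumptions for illustration only:

```rust
// Illustrative sketch of the per-file filter evaluation in the new pipeline
// (simplified from `filter_partitions` above; not part of the patch).
use std::sync::Arc;

use arrow::array::AsArray;
use arrow::datatypes::{DataType, Field};
use arrow::record_batch::RecordBatch;
use datafusion_common::{DFSchema, Result, ScalarValue};
use datafusion_expr::execution_props::ExecutionProps;
use datafusion_expr::{col, lit};
use datafusion_physical_expr::create_physical_expr;

/// Returns true when a file under `year=<partition_value>` survives the
/// (assumed) pushed-down filter `year = '2024'`.
fn file_matches(partition_value: &str) -> Result<bool> {
    // Schema holding only the hive partition column
    let df_schema = DFSchema::from_unqualified_fields(
        vec![Field::new("year", DataType::Utf8, true)].into(),
        Default::default(),
    )?;

    // One-row batch carrying this file's partition value
    let array = ScalarValue::Utf8(Some(partition_value.to_string())).to_array()?;
    let batch = RecordBatch::try_new(Arc::clone(df_schema.inner()), vec![array])?;

    // Compile the logical filter to a physical expression
    let filter = col("year").eq(lit("2024"));
    let expr = create_physical_expr(&filter, &df_schema, &ExecutionProps::new())?;

    // The resulting boolean array has exactly one value: keep or skip the file
    let matches = expr.evaluate(&batch)?.into_array(1)?;
    Ok(matches.as_boolean().value(0))
}
```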
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))] #![doc( html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" @@ -31,4 +32,4 @@ mod table; pub use config::{ListingTableConfig, SchemaSource}; pub use options::ListingOptions; -pub use table::ListingTable; +pub use table::{ListFilesResult, ListingTable}; diff --git a/datafusion/catalog-listing/src/options.rs b/datafusion/catalog-listing/src/options.rs index 7da8005f90ec2..146f98d62335e 100644 --- a/datafusion/catalog-listing/src/options.rs +++ b/datafusion/catalog-listing/src/options.rs @@ -18,12 +18,12 @@ use arrow::datatypes::{DataType, SchemaRef}; use datafusion_catalog::Session; use datafusion_common::plan_err; -use datafusion_datasource::file_format::FileFormat; use datafusion_datasource::ListingTableUrl; +use datafusion_datasource::file_format::FileFormat; use datafusion_execution::config::SessionConfig; use datafusion_expr::SortExpr; use futures::StreamExt; -use futures::{future, TryStreamExt}; +use futures::{TryStreamExt, future}; use itertools::Itertools; use std::sync::Arc; diff --git a/datafusion/catalog-listing/src/table.rs b/datafusion/catalog-listing/src/table.rs index 95f9523d4401c..06ba8c8113fac 100644 --- a/datafusion/catalog-listing/src/table.rs +++ b/datafusion/catalog-listing/src/table.rs @@ -23,19 +23,18 @@ use async_trait::async_trait; use datafusion_catalog::{ScanArgs, ScanResult, Session, TableProvider}; use datafusion_common::stats::Precision; use datafusion_common::{ - internal_datafusion_err, plan_err, project_schema, Constraints, DataFusionError, - SchemaExt, Statistics, + Constraints, SchemaExt, Statistics, internal_datafusion_err, plan_err, project_schema, }; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_groups::FileGroup; use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; -use datafusion_datasource::file_sink_config::FileSinkConfig; -use datafusion_datasource::schema_adapter::{ - DefaultSchemaAdapterFactory, SchemaAdapter, SchemaAdapterFactory, -}; +use datafusion_datasource::file_sink_config::{FileOutputMode, FileSinkConfig}; +#[expect(deprecated)] +use datafusion_datasource::schema_adapter::SchemaAdapterFactory; use datafusion_datasource::{ - compute_all_files_statistics, ListingTableUrl, PartitionedFile, + ListingTableUrl, PartitionedFile, TableSchema, compute_all_files_statistics, }; +use datafusion_execution::cache::TableScopedPath; use datafusion_execution::cache::cache_manager::FileStatisticsCache; use datafusion_execution::cache::cache_unit::DefaultFileStatisticsCache; use datafusion_expr::dml::InsertOp; @@ -44,14 +43,24 @@ use datafusion_expr::{Expr, TableProviderFilterPushDown, TableType}; use datafusion_physical_expr::create_lex_ordering; use datafusion_physical_expr_adapter::PhysicalExprAdapterFactory; use datafusion_physical_expr_common::sort_expr::LexOrdering; -use datafusion_physical_plan::empty::EmptyExec; use datafusion_physical_plan::ExecutionPlan; -use futures::{future, stream, Stream, StreamExt, TryStreamExt}; +use datafusion_physical_plan::empty::EmptyExec; +use futures::{Stream, StreamExt, TryStreamExt, future, stream}; use object_store::ObjectStore; -use std::any::Any; use std::collections::HashMap; use std::sync::Arc; +/// Result of a file listing 
operation from [`ListingTable::list_files_for_scan`]. +#[derive(Debug)] +pub struct ListFilesResult { + /// File groups organized by the partitioning strategy. + pub file_groups: Vec, + /// Aggregated statistics for all files. + pub statistics: Statistics, + /// Whether files are grouped by partition values (enables Hash partitioning). + pub grouped_by_partition: bool, +} + /// Built in [`TableProvider`] that reads data from one or more files as a single table. /// /// The files are read using an [`ObjectStore`] instance, for example from @@ -178,13 +187,11 @@ pub struct ListingTable { /// The SQL definition for this table, if any definition: Option, /// Cache for collected file statistics - collected_statistics: FileStatisticsCache, + collected_statistics: Arc, /// Constraints applied to this table constraints: Constraints, /// Column default expressions for columns that are not physically present in the data files column_defaults: HashMap, - /// Optional [`SchemaAdapterFactory`] for creating schema adapters - schema_adapter_factory: Option>, /// Optional [`PhysicalExprAdapterFactory`] for creating physical expression adapters expr_adapter_factory: Option>, } @@ -227,7 +234,6 @@ impl ListingTable { collected_statistics: Arc::new(DefaultFileStatisticsCache::default()), constraints: Constraints::default(), column_defaults: HashMap::new(), - schema_adapter_factory: config.schema_adapter_factory, expr_adapter_factory: config.expr_adapter_factory, }; @@ -255,7 +261,7 @@ impl ListingTable { /// multiple times in the same session. /// /// If `None`, creates a new [`DefaultFileStatisticsCache`] scoped to this query. - pub fn with_cache(mut self, cache: Option) -> Self { + pub fn with_cache(mut self, cache: Option>) -> Self { self.collected_statistics = cache.unwrap_or_else(|| Arc::new(DefaultFileStatisticsCache::default())); self @@ -282,83 +288,151 @@ impl ListingTable { self.schema_source } - /// Set the [`SchemaAdapterFactory`] for this [`ListingTable`] + /// Deprecated: Set the [`SchemaAdapterFactory`] for this [`ListingTable`] /// - /// The schema adapter factory is used to create schema adapters that can - /// handle schema evolution and type conversions when reading files with - /// different schemas than the table schema. + /// `SchemaAdapterFactory` has been removed. Use [`ListingTableConfig::with_expr_adapter_factory`] + /// and `PhysicalExprAdapterFactory` instead. See `upgrading.md` for more details. 
/// - /// # Example: Adding Schema Evolution Support - /// ```rust - /// # use std::sync::Arc; - /// # use datafusion_catalog_listing::{ListingTable, ListingTableConfig, ListingOptions}; - /// # use datafusion_datasource::ListingTableUrl; - /// # use datafusion_datasource::schema_adapter::{DefaultSchemaAdapterFactory, SchemaAdapter}; - /// # use datafusion_datasource_parquet::file_format::ParquetFormat; - /// # use arrow::datatypes::{SchemaRef, Schema, Field, DataType}; - /// # let table_path = ListingTableUrl::parse("file:///path/to/data").unwrap(); - /// # let options = ListingOptions::new(Arc::new(ParquetFormat::default())); - /// # let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)])); - /// # let config = ListingTableConfig::new(table_path).with_listing_options(options).with_schema(schema); - /// # let table = ListingTable::try_new(config).unwrap(); - /// let table_with_evolution = table - /// .with_schema_adapter_factory(Arc::new(DefaultSchemaAdapterFactory)); - /// ``` - /// See [`ListingTableConfig::with_schema_adapter_factory`] for an example of custom SchemaAdapterFactory. + /// This method is a no-op and returns `self` unchanged. + #[deprecated( + since = "52.0.0", + note = "SchemaAdapterFactory has been removed. Use ListingTableConfig::with_expr_adapter_factory and PhysicalExprAdapterFactory instead. See upgrading.md for more details." + )] + #[expect(deprecated)] pub fn with_schema_adapter_factory( self, - schema_adapter_factory: Arc, + _schema_adapter_factory: Arc, ) -> Self { - Self { - schema_adapter_factory: Some(schema_adapter_factory), - ..self - } - } - - /// Get the [`SchemaAdapterFactory`] for this table - pub fn schema_adapter_factory(&self) -> Option<&Arc> { - self.schema_adapter_factory.as_ref() + // No-op - just return self unchanged + self } - /// Creates a schema adapter for mapping between file and table schemas + /// Deprecated: Returns the [`SchemaAdapterFactory`] used by this [`ListingTable`]. /// - /// Uses the configured schema adapter factory if available, otherwise falls back - /// to the default implementation. - fn create_schema_adapter(&self) -> Box { - let table_schema = self.schema(); - match &self.schema_adapter_factory { - Some(factory) => { - factory.create_with_projected_schema(Arc::clone(&table_schema)) - } - None => DefaultSchemaAdapterFactory::from_schema(Arc::clone(&table_schema)), - } + /// `SchemaAdapterFactory` has been removed. Use `PhysicalExprAdapterFactory` instead. + /// See `upgrading.md` for more details. + /// + /// Always returns `None`. + #[deprecated( + since = "52.0.0", + note = "SchemaAdapterFactory has been removed. Use PhysicalExprAdapterFactory instead. See upgrading.md for more details." + )] + #[expect(deprecated)] + pub fn schema_adapter_factory(&self) -> Option> { + None } - /// Creates a file source and applies schema adapter factory if available - fn create_file_source_with_schema_adapter( - &self, - ) -> datafusion_common::Result> { - let mut source = self.options.format.file_source(); - // Apply schema adapter to source if available - // - // The source will use this SchemaAdapter to adapt data batches as they flow up the plan. - // Note: ListingTable also creates a SchemaAdapter in `scan()` but that is only used to adapt collected statistics. 
- if let Some(factory) = &self.schema_adapter_factory { - source = source.with_schema_adapter_factory(Arc::clone(factory))?; - } - Ok(source) + /// Creates a file source for this table + fn create_file_source(&self) -> Arc { + let table_schema = TableSchema::new( + Arc::clone(&self.file_schema), + self.options + .table_partition_cols + .iter() + .map(|(col, field)| Arc::new(Field::new(col, field.clone(), false))) + .collect(), + ); + + self.options.format.file_source(table_schema) } - /// If file_sort_order is specified, creates the appropriate physical expressions + /// Creates output ordering from user-specified file_sort_order or derives + /// from file orderings when user doesn't specify. + /// + /// If user specified `file_sort_order`, that takes precedence. + /// Otherwise, attempts to derive common ordering from file orderings in + /// the provided file groups. pub fn try_create_output_ordering( &self, execution_props: &ExecutionProps, + file_groups: &[FileGroup], ) -> datafusion_common::Result> { - create_lex_ordering( - &self.table_schema, - &self.options.file_sort_order, - execution_props, - ) + // If user specified sort order, use that + if !self.options.file_sort_order.is_empty() { + return create_lex_ordering( + &self.table_schema, + &self.options.file_sort_order, + execution_props, + ); + } + if let Some(ordering) = derive_common_ordering_from_files(file_groups) { + return Ok(vec![ordering]); + } + Ok(vec![]) + } +} + +/// Derives a common ordering from file orderings across all file groups. +/// +/// Returns the common ordering if all files have compatible orderings, +/// otherwise returns None. +/// +/// The function finds the longest common prefix among all file orderings. +/// For example, if files have orderings `[a, b, c]` and `[a, b]`, the common +/// ordering is `[a, b]`. 
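Before the implementation below, a self-contained sketch of the prefix folding it performs, with `&str` column names standing in for `PhysicalSortExpr`s (a simplification; the real comparison also covers sort options such as direction and null ordering):

    /// Fold per-file orderings into their longest common prefix.
    /// None if any file lacks an ordering or the prefix is empty.
    fn common_ordering<'a>(
        file_orderings: &[Option<Vec<&'a str>>],
    ) -> Option<Vec<&'a str>> {
        let mut common: Option<Vec<&str>> = None;
        for ordering in file_orderings {
            let ordering = ordering.as_ref()?; // any unordered file => None
            common = Some(match common {
                None => ordering.clone(),
                Some(current) => {
                    let prefix: Vec<&str> = current
                        .iter()
                        .zip(ordering.iter())
                        .take_while(|(a, b)| a == b)
                        .map(|(a, _)| *a)
                        .collect();
                    if prefix.is_empty() {
                        return None; // no common prefix at all
                    }
                    prefix
                }
            });
        }
        common
    }

    fn main() {
        // [a, b, c] and [a, b] share the prefix [a, b]
        let files = [Some(vec!["a", "b", "c"]), Some(vec!["a", "b"])];
        assert_eq!(common_ordering(&files), Some(vec!["a", "b"]));
        // One unordered file poisons the result
        assert_eq!(common_ordering(&[Some(vec!["a"]), None]), None);
    }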
+fn derive_common_ordering_from_files(file_groups: &[FileGroup]) -> Option<LexOrdering> {
+    enum CurrentOrderingState {
+        /// Initial state before processing any files
+        FirstFile,
+        /// Some common ordering found so far
+        SomeOrdering(LexOrdering),
+        /// No files have ordering
+        NoOrdering,
+    }
+    let mut state = CurrentOrderingState::FirstFile;
+
+    // Fold each file's ordering into the running common-prefix state
+    for group in file_groups {
+        for file in group.iter() {
+            state = match (&state, &file.ordering) {
+                // If this is the first file with ordering, set it as current
+                (CurrentOrderingState::FirstFile, Some(ordering)) => {
+                    CurrentOrderingState::SomeOrdering(ordering.clone())
+                }
+                (CurrentOrderingState::FirstFile, None) => {
+                    CurrentOrderingState::NoOrdering
+                }
+                // If we have an existing ordering, find common prefix with new ordering
+                (CurrentOrderingState::SomeOrdering(current), Some(ordering)) => {
+                    // Find common prefix between current and new ordering
+                    let prefix_len = current
+                        .as_ref()
+                        .iter()
+                        .zip(ordering.as_ref().iter())
+                        .take_while(|(a, b)| a == b)
+                        .count();
+                    if prefix_len == 0 {
+                        log::trace!(
+                            "Cannot derive common ordering: no common prefix between orderings {current:?} and {ordering:?}"
+                        );
+                        return None;
+                    } else {
+                        let ordering =
+                            LexOrdering::new(current.as_ref()[..prefix_len].to_vec())
+                                .expect("prefix_len > 0, so ordering must be valid");
+                        CurrentOrderingState::SomeOrdering(ordering)
+                    }
+                }
+                // If one file has ordering and another doesn't, there is no common
+                // ordering. Return None and log a trace message explaining why.
+                (CurrentOrderingState::SomeOrdering(ordering), None)
+                | (CurrentOrderingState::NoOrdering, Some(ordering)) => {
+                    log::trace!(
+                        "Cannot derive common ordering: some files have ordering {ordering:?}, others don't"
+                    );
+                    return None;
+                }
+                // Both have no ordering, remain in NoOrdering state
+                (CurrentOrderingState::NoOrdering, None) => {
+                    CurrentOrderingState::NoOrdering
+                }
+            };
+        }
+    }
+
+    match state {
+        CurrentOrderingState::SomeOrdering(ordering) => Some(ordering),
+        _ => None,
+    }
+}

@@ -374,10 +448,6 @@ fn can_be_evaluated_for_partition_pruning(
 #[async_trait]
 impl TableProvider for ListingTable {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
     fn schema(&self) -> SchemaRef {
         Arc::clone(&self.table_schema)
     }
@@ -418,7 +488,7 @@ impl TableProvider for ListingTable {
             .options
             .table_partition_cols
             .iter()
-            .map(|col| Ok(self.table_schema.field_with_name(&col.0)?.clone()))
+            .map(|col| Ok(Arc::new(self.table_schema.field_with_name(&col.0)?.clone())))
             .collect::<Result<Vec<_>>>()?;

         let table_partition_col_names = table_partition_cols
@@ -437,7 +507,11 @@ impl TableProvider for ListingTable {
         // at the same time. This is because the limit should be applied after the filters are applied.
let statistic_file_limit = if filters.is_empty() { limit } else { None }; - let (mut partitioned_file_lists, statistics) = self + let ListFilesResult { + file_groups: mut partitioned_file_lists, + statistics, + grouped_by_partition: partitioned_by_file_group, + } = self .list_files_for_scan(state, &partition_filters, statistic_file_limit) .await?; @@ -447,7 +521,10 @@ impl TableProvider for ListingTable { return Ok(ScanResult::new(Arc::new(EmptyExec::new(projected_schema)))); } - let output_ordering = self.try_create_output_ordering(state.execution_props())?; + let output_ordering = self.try_create_output_ordering( + state.execution_props(), + &partitioned_file_lists, + )?; match state .config_options() .execution @@ -469,7 +546,9 @@ impl TableProvider for ListingTable { if new_groups.len() <= self.options.target_partitions { partitioned_file_lists = new_groups; } else { - log::debug!("attempted to split file groups by statistics, but there were more file groups than target_partitions; falling back to unordered") + log::debug!( + "attempted to split file groups by statistics, but there were more file groups than target_partitions; falling back to unordered" + ) } } None => {} // no ordering required @@ -483,7 +562,7 @@ impl TableProvider for ListingTable { ))))); }; - let file_source = self.create_file_source_with_schema_adapter()?; + let file_source = self.create_file_source(); // create the execution plan let plan = self @@ -491,20 +570,16 @@ impl TableProvider for ListingTable { .format .create_physical_plan( state, - FileScanConfigBuilder::new( - object_store_url, - Arc::clone(&self.file_schema), - file_source, - ) - .with_file_groups(partitioned_file_lists) - .with_constraints(self.constraints.clone()) - .with_statistics(statistics) - .with_projection_indices(projection) - .with_limit(limit) - .with_output_ordering(output_ordering) - .with_table_partition_cols(table_partition_cols) - .with_expr_adapter(self.expr_adapter_factory.clone()) - .build(), + FileScanConfigBuilder::new(object_store_url, file_source) + .with_file_groups(partitioned_file_lists) + .with_constraints(self.constraints.clone()) + .with_statistics(statistics) + .with_projection_indices(projection)? 
+ .with_limit(limit) + .with_output_ordering(output_ordering) + .with_expr_adapter(self.expr_adapter_factory.clone()) + .with_partitioned_by_file_group(partitioned_by_file_group) + .build(), ) .await?; @@ -574,6 +649,15 @@ impl TableProvider for ListingTable { let keep_partition_by_columns = state.config_options().execution.keep_partition_by_columns; + // Invalidate cache entries for this table if they exist + if let Some(lfc) = state.runtime_env().cache_manager.get_list_files_cache() { + let key = TableScopedPath { + table: table_path.get_table_ref().clone(), + path: table_path.prefix().clone(), + }; + let _ = lfc.remove(&key); + } + // Sink related option, apart from format let config = FileSinkConfig { original_url: String::default(), @@ -585,9 +669,11 @@ impl TableProvider for ListingTable { insert_op, keep_partition_by_columns, file_extension: self.options().format.get_ext(), + file_output_mode: FileOutputMode::Automatic, }; - let orderings = self.try_create_output_ordering(state.execution_props())?; + // For writes, we only use user-specified ordering (no file groups to derive from) + let orderings = self.try_create_output_ordering(state.execution_props(), &[])?; // It is sufficient to pass only one of the equivalent orderings: let order_requirements = orderings.into_iter().next().map(Into::into); @@ -611,11 +697,15 @@ impl ListingTable { ctx: &'a dyn Session, filters: &'a [Expr], limit: Option, - ) -> datafusion_common::Result<(Vec, Statistics)> { + ) -> datafusion_common::Result { let store = if let Some(url) = self.table_paths.first() { ctx.runtime_env().object_store(url)? } else { - return Ok((vec![], Statistics::new_unknown(&self.file_schema))); + return Ok(ListFilesResult { + file_groups: vec![], + statistics: Statistics::new_unknown(&self.file_schema), + grouped_by_partition: false, + }); }; // list files (with partitions) let file_list = future::try_join_all(self.table_paths.iter().map(|table_path| { @@ -632,16 +722,19 @@ impl ListingTable { let meta_fetch_concurrency = ctx.config_options().execution.meta_fetch_concurrency; let file_list = stream::iter(file_list).flatten_unordered(meta_fetch_concurrency); - // collect the statistics if required by the config + // collect the statistics and ordering if required by the config let files = file_list .map(|part_file| async { let part_file = part_file?; - let statistics = if self.options.collect_stat { - self.do_collect_statistics(ctx, &store, &part_file).await? + let (statistics, ordering) = if self.options.collect_stat { + self.do_collect_statistics_and_ordering(ctx, &store, &part_file) + .await? } else { - Arc::new(Statistics::new_unknown(&self.file_schema)) + (Arc::new(Statistics::new_unknown(&self.file_schema)), None) }; - Ok(part_file.with_statistics(statistics)) + Ok(part_file + .with_statistics(statistics) + .with_ordering(ordering)) }) .boxed() .buffer_unordered(ctx.config_options().execution.meta_fetch_concurrency); @@ -649,65 +742,97 @@ impl ListingTable { let (file_group, inexact_stats) = get_files_with_limit(files, limit, self.options.collect_stat).await?; - let file_groups = file_group.split_files(self.options.target_partitions); - let (mut file_groups, mut stats) = compute_all_files_statistics( + // Threshold: 0 = disabled, N > 0 = enabled when distinct_keys >= N + // + // When enabled, files are grouped by their Hive partition column values, allowing + // FileScanConfig to declare Hash partitioning. This enables the optimizer to skip + // hash repartitioning for aggregates and joins on partition columns. 
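The comment above compresses the decision rule for `optimizer.preserve_file_partitions`; a condensed restatement of just that branch condition, not the actual control flow (the grouping call and `target_partitions` handling are elided):

    /// Keep partition-value grouping only when the feature is enabled
    /// (threshold > 0), the table has partition columns, and enough
    /// distinct partition groups exist to make Hash partitioning useful.
    fn keep_partition_grouping(
        threshold: usize,
        distinct_groups: usize,
        has_partition_cols: bool,
    ) -> bool {
        threshold > 0 && has_partition_cols && distinct_groups >= threshold
    }

    fn main() {
        assert!(!keep_partition_grouping(0, 16, true)); // feature disabled
        assert!(!keep_partition_grouping(8, 3, true)); // too few groups: fall back to split_files
        assert!(keep_partition_grouping(8, 16, true)); // Hash partitioning preserved
        assert!(!keep_partition_grouping(8, 16, false)); // no partition columns
    }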
+        let threshold = ctx.config_options().optimizer.preserve_file_partitions;
+
+        let (file_groups, grouped_by_partition) = if threshold > 0
+            && !self.options.table_partition_cols.is_empty()
+        {
+            let grouped =
+                file_group.group_by_partition_values(self.options.target_partitions);
+            if grouped.len() >= threshold {
+                (grouped, true)
+            } else {
+                let all_files: Vec<_> =
+                    grouped.into_iter().flat_map(|g| g.into_inner()).collect();
+                (
+                    FileGroup::new(all_files).split_files(self.options.target_partitions),
+                    false,
+                )
+            }
+        } else {
+            (
+                file_group.split_files(self.options.target_partitions),
+                false,
+            )
+        };
+
+        let (file_groups, stats) = compute_all_files_statistics(
             file_groups,
             self.schema(),
             self.options.collect_stat,
             inexact_stats,
         )?;
-        let schema_adapter = self.create_schema_adapter();
-        let (schema_mapper, _) = schema_adapter.map_schema(self.file_schema.as_ref())?;
-
-        stats.column_statistics =
-            schema_mapper.map_column_statistics(&stats.column_statistics)?;
-        file_groups.iter_mut().try_for_each(|file_group| {
-            if let Some(stat) = file_group.statistics_mut() {
-                stat.column_statistics =
-                    schema_mapper.map_column_statistics(&stat.column_statistics)?;
-            }
-            Ok::<_, DataFusionError>(())
-        })?;
-        Ok((file_groups, stats))
+        // Note: Statistics already include both file columns and partition columns.
+        // PartitionedFile::with_statistics automatically appends exact partition column
+        // statistics (min=max=partition_value, null_count=0, distinct_count=1) computed
+        // from partition_values.
+        Ok(ListFilesResult {
+            file_groups,
+            statistics: stats,
+            grouped_by_partition,
+        })
     }

-    /// Collects statistics for a given partitioned file.
+    /// Collects statistics and ordering for a given partitioned file.
     ///
-    /// This method first checks if the statistics for the given file are already cached.
-    /// If they are, it returns the cached statistics.
-    /// If they are not, it infers the statistics from the file and stores them in the cache.
-    async fn do_collect_statistics(
+    /// This method first checks the cache. On a hit with still-valid metadata it
+    /// returns the cached statistics and ordering; on a miss it infers both
+    /// statistics and ordering in a single metadata read for efficiency.
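The cache-hit path in the function below hinges on `is_valid_for`: an entry is only reused while the object's metadata still matches what was recorded at insert time. A sketch of that staleness check, assuming identity is (size, last_modified, e_tag); the concrete fields `CachedFileMetadata` compares are not shown in this diff, and `FileMeta`/`CachedEntry` are simplified stand-ins:

    use std::time::SystemTime;

    /// Simplified stand-in for object_store::ObjectMeta.
    #[derive(Clone, PartialEq)]
    struct FileMeta {
        size: u64,
        last_modified: SystemTime,
        e_tag: Option<String>,
    }

    struct CachedEntry<T> {
        seen_meta: FileMeta,
        payload: T, // statistics + ordering in the real cache
    }

    impl<T> CachedEntry<T> {
        /// Reuse the entry only if the file looks unchanged since caching.
        fn is_valid_for(&self, current: &FileMeta) -> bool {
            self.seen_meta == *current
        }
    }

    fn main() {
        let meta = FileMeta {
            size: 100,
            last_modified: SystemTime::now(),
            e_tag: None,
        };
        let entry = CachedEntry { seen_meta: meta.clone(), payload: "stats" };
        assert!(entry.is_valid_for(&meta));
        println!("reusing cached payload: {}", entry.payload);
        // A rewritten file (new size) invalidates the entry, forcing re-inference
        let rewritten = FileMeta { size: 200, ..meta };
        assert!(!entry.is_valid_for(&rewritten));
    }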
+ async fn do_collect_statistics_and_ordering( &self, ctx: &dyn Session, store: &Arc, part_file: &PartitionedFile, - ) -> datafusion_common::Result> { - match self - .collected_statistics - .get_with_extra(&part_file.object_meta.location, &part_file.object_meta) + ) -> datafusion_common::Result<(Arc, Option)> { + use datafusion_execution::cache::cache_manager::CachedFileMetadata; + + let path = &part_file.object_meta.location; + let meta = &part_file.object_meta; + + // Check cache first - if we have valid cached statistics and ordering + if let Some(cached) = self.collected_statistics.get(path) + && cached.is_valid_for(meta) { - Some(statistics) => Ok(statistics), - None => { - let statistics = self - .options - .format - .infer_stats( - ctx, - store, - Arc::clone(&self.file_schema), - &part_file.object_meta, - ) - .await?; - let statistics = Arc::new(statistics); - self.collected_statistics.put_with_extra( - &part_file.object_meta.location, - Arc::clone(&statistics), - &part_file.object_meta, - ); - Ok(statistics) - } + // Return cached statistics and ordering + return Ok((Arc::clone(&cached.statistics), cached.ordering.clone())); } + + // Cache miss or invalid: fetch both statistics and ordering in a single metadata read + let file_meta = self + .options + .format + .infer_stats_and_ordering(ctx, store, Arc::clone(&self.file_schema), meta) + .await?; + + let statistics = Arc::new(file_meta.statistics); + + // Store in cache + self.collected_statistics.put( + path, + CachedFileMetadata::new( + meta.clone(), + Arc::clone(&statistics), + file_meta.ordering.clone(), + ), + ); + + Ok((statistics, file_meta.ordering)) } } @@ -756,28 +881,25 @@ async fn get_files_with_limit( let file = file_result?; // Update file statistics regardless of state - if collect_stats { - if let Some(file_stats) = &file.statistics { - num_rows = if file_group.is_empty() { - // For the first file, just take its row count - file_stats.num_rows - } else { - // For subsequent files, accumulate the counts - num_rows.add(&file_stats.num_rows) - }; - } + if collect_stats && let Some(file_stats) = &file.statistics { + num_rows = if file_group.is_empty() { + // For the first file, just take its row count + file_stats.num_rows + } else { + // For subsequent files, accumulate the counts + num_rows.add(&file_stats.num_rows) + }; } // Always add the file to our group file_group.push(file); // Check if we've hit the limit (if one was specified) - if let Some(limit) = limit { - if let Precision::Exact(row_count) = num_rows { - if row_count > limit { - state = ProcessingState::ReachedLimit; - } - } + if let Some(limit) = limit + && let Precision::Exact(row_count) = num_rows + && row_count > limit + { + state = ProcessingState::ReachedLimit; } } // If we still have files in the stream, it means that the limit kicked @@ -786,3 +908,145 @@ async fn get_files_with_limit( let inexact_stats = all_files.next().await.is_some(); Ok((file_group, inexact_stats)) } + +#[cfg(test)] +mod tests { + use super::*; + use arrow::compute::SortOptions; + use datafusion_physical_expr::expressions::Column; + use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr; + + /// Helper to create a PhysicalSortExpr + fn sort_expr( + name: &str, + idx: usize, + descending: bool, + nulls_first: bool, + ) -> PhysicalSortExpr { + PhysicalSortExpr::new( + Arc::new(Column::new(name, idx)), + SortOptions { + descending, + nulls_first, + }, + ) + } + + /// Helper to create a LexOrdering (unwraps the Option) + fn lex_ordering(exprs: Vec) -> LexOrdering { + 
LexOrdering::new(exprs).expect("expected non-empty ordering") + } + + /// Helper to create a PartitionedFile with optional ordering + fn create_file(name: &str, ordering: Option) -> PartitionedFile { + PartitionedFile::new(name.to_string(), 1024).with_ordering(ordering) + } + + #[test] + fn test_derive_common_ordering_all_files_same_ordering() { + // All files have the same ordering -> returns that ordering + let ordering = lex_ordering(vec![ + sort_expr("a", 0, false, true), + sort_expr("b", 1, true, false), + ]); + + let file_groups = vec![ + FileGroup::new(vec![ + create_file("f1.parquet", Some(ordering.clone())), + create_file("f2.parquet", Some(ordering.clone())), + ]), + FileGroup::new(vec![create_file("f3.parquet", Some(ordering.clone()))]), + ]; + + let result = derive_common_ordering_from_files(&file_groups); + assert_eq!(result, Some(ordering)); + } + + #[test] + fn test_derive_common_ordering_common_prefix() { + // Files have different orderings but share a common prefix + let ordering_abc = lex_ordering(vec![ + sort_expr("a", 0, false, true), + sort_expr("b", 1, false, true), + sort_expr("c", 2, false, true), + ]); + let ordering_ab = lex_ordering(vec![ + sort_expr("a", 0, false, true), + sort_expr("b", 1, false, true), + ]); + + let file_groups = vec![FileGroup::new(vec![ + create_file("f1.parquet", Some(ordering_abc)), + create_file("f2.parquet", Some(ordering_ab.clone())), + ])]; + + let result = derive_common_ordering_from_files(&file_groups); + assert_eq!(result, Some(ordering_ab)); + } + + #[test] + fn test_derive_common_ordering_no_common_prefix() { + // Files have completely different orderings -> returns None + let ordering_a = lex_ordering(vec![sort_expr("a", 0, false, true)]); + let ordering_b = lex_ordering(vec![sort_expr("b", 1, false, true)]); + + let file_groups = vec![FileGroup::new(vec![ + create_file("f1.parquet", Some(ordering_a)), + create_file("f2.parquet", Some(ordering_b)), + ])]; + + let result = derive_common_ordering_from_files(&file_groups); + assert_eq!(result, None); + } + + #[test] + fn test_derive_common_ordering_mixed_with_none() { + // Some files have ordering, some don't -> returns None + let ordering = lex_ordering(vec![sort_expr("a", 0, false, true)]); + + let file_groups = vec![FileGroup::new(vec![ + create_file("f1.parquet", Some(ordering)), + create_file("f2.parquet", None), + ])]; + + let result = derive_common_ordering_from_files(&file_groups); + assert_eq!(result, None); + } + + #[test] + fn test_derive_common_ordering_all_none() { + // No files have ordering -> returns None + let file_groups = vec![FileGroup::new(vec![ + create_file("f1.parquet", None), + create_file("f2.parquet", None), + ])]; + + let result = derive_common_ordering_from_files(&file_groups); + assert_eq!(result, None); + } + + #[test] + fn test_derive_common_ordering_empty_groups() { + // Empty file groups -> returns None + let file_groups: Vec = vec![]; + let result = derive_common_ordering_from_files(&file_groups); + assert_eq!(result, None); + } + + #[test] + fn test_derive_common_ordering_single_file() { + // Single file with ordering -> returns that ordering + let ordering = lex_ordering(vec![ + sort_expr("a", 0, false, true), + sort_expr("b", 1, true, false), + ]); + + let file_groups = vec![FileGroup::new(vec![create_file( + "f1.parquet", + Some(ordering.clone()), + )])]; + + let result = derive_common_ordering_from_files(&file_groups); + assert_eq!(result, Some(ordering)); + } +} diff --git a/datafusion/catalog/Cargo.toml b/datafusion/catalog/Cargo.toml index 
a1db45654be01..1009e9aee477b 100644 --- a/datafusion/catalog/Cargo.toml +++ b/datafusion/catalog/Cargo.toml @@ -49,5 +49,8 @@ object_store = { workspace = true } parking_lot = { workspace = true } tokio = { workspace = true } +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/catalog/src/async.rs b/datafusion/catalog/src/async.rs index 1c830c976d8b8..87b7b7c3431a1 100644 --- a/datafusion/catalog/src/async.rs +++ b/datafusion/catalog/src/async.rs @@ -18,7 +18,7 @@ use std::sync::Arc; use async_trait::async_trait; -use datafusion_common::{error::Result, not_impl_err, HashMap, TableReference}; +use datafusion_common::{HashMap, TableReference, error::Result, not_impl_err}; use datafusion_execution::config::SessionConfig; use crate::{CatalogProvider, CatalogProviderList, SchemaProvider, TableProvider}; @@ -37,10 +37,6 @@ impl SchemaProvider for ResolvedSchemaProvider { self.owner_name.as_deref() } - fn as_any(&self) -> &dyn std::any::Any { - self - } - fn table_names(&self) -> Vec { self.cached_tables.keys().cloned().collect() } @@ -60,7 +56,9 @@ impl SchemaProvider for ResolvedSchemaProvider { } fn deregister_table(&self, name: &str) -> Result>> { - not_impl_err!("Attempt to deregister table '{name}' with ResolvedSchemaProvider which is not supported") + not_impl_err!( + "Attempt to deregister table '{name}' with ResolvedSchemaProvider which is not supported" + ) } fn table_exist(&self, name: &str) -> bool { @@ -113,10 +111,6 @@ struct ResolvedCatalogProvider { cached_schemas: HashMap>, } impl CatalogProvider for ResolvedCatalogProvider { - fn as_any(&self) -> &dyn std::any::Any { - self - } - fn schema_names(&self) -> Vec { self.cached_schemas.keys().cloned().collect() } @@ -158,10 +152,6 @@ struct ResolvedCatalogProviderList { cached_catalogs: HashMap>, } impl CatalogProviderList for ResolvedCatalogProviderList { - fn as_any(&self) -> &dyn std::any::Any { - self - } - fn register_catalog( &self, _name: String, @@ -193,7 +183,7 @@ impl CatalogProviderList for ResolvedCatalogProviderList { /// /// See the [remote_catalog.rs] for an end to end example /// -/// [remote_catalog.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/remote_catalog.rs +/// [remote_catalog.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/data_io/remote_catalog.rs #[async_trait] pub trait AsyncSchemaProvider: Send + Sync { /// Lookup a table in the schema provider @@ -422,17 +412,14 @@ pub trait AsyncCatalogProviderList: Send + Sync { #[cfg(test)] mod tests { - use std::{ - any::Any, - sync::{ - atomic::{AtomicU32, Ordering}, - Arc, - }, + use std::sync::{ + Arc, + atomic::{AtomicU32, Ordering}, }; use arrow::datatypes::SchemaRef; use async_trait::async_trait; - use datafusion_common::{error::Result, Statistics, TableReference}; + use datafusion_common::{Statistics, TableReference, error::Result}; use datafusion_execution::config::SessionConfig; use datafusion_expr::{Expr, TableType}; use datafusion_physical_plan::ExecutionPlan; @@ -445,10 +432,6 @@ mod tests { struct MockTableProvider {} #[async_trait] impl TableProvider for MockTableProvider { - fn as_any(&self) -> &dyn Any { - self - } - /// Get a reference to the schema for this table fn schema(&self) -> SchemaRef { unimplemented!() diff --git a/datafusion/catalog/src/catalog.rs b/datafusion/catalog/src/catalog.rs index 
71b9eccf9d657..34cdf74440cb3 100644 --- a/datafusion/catalog/src/catalog.rs +++ b/datafusion/catalog/src/catalog.rs @@ -20,8 +20,8 @@ use std::fmt::Debug; use std::sync::Arc; pub use crate::schema::SchemaProvider; -use datafusion_common::not_impl_err; use datafusion_common::Result; +use datafusion_common::not_impl_err; /// Represents a catalog, comprising a number of named schemas. /// @@ -61,7 +61,7 @@ use datafusion_common::Result; /// schemas and tables exist. /// /// [Delta Lake]: https://delta.io/ -/// [`remote_catalog`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/remote_catalog.rs +/// [`remote_catalog`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/data_io/remote_catalog.rs /// /// The [`CatalogProvider`] can support this use case, but it takes some care. /// The planning APIs in DataFusion are not `async` and thus network IO can not @@ -100,16 +100,12 @@ use datafusion_common::Result; /// /// [`datafusion-cli`]: https://datafusion.apache.org/user-guide/cli/index.html /// [`DynamicFileCatalogProvider`]: https://github.com/apache/datafusion/blob/31b9b48b08592b7d293f46e75707aad7dadd7cbc/datafusion-cli/src/catalog.rs#L75 -/// [`catalog.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/catalog.rs +/// [`catalog.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/data_io/catalog.rs /// [delta-rs]: https://github.com/delta-io/delta-rs /// [`UnityCatalogProvider`]: https://github.com/delta-io/delta-rs/blob/951436ecec476ce65b5ed3b58b50fb0846ca7b91/crates/deltalake-core/src/data_catalog/unity/datafusion.rs#L111-L123 /// /// [`TableProvider`]: crate::TableProvider -pub trait CatalogProvider: Debug + Sync + Send { - /// Returns the catalog provider as [`Any`] - /// so that it can be downcast to a specific implementation. - fn as_any(&self) -> &dyn Any; - +pub trait CatalogProvider: Any + Debug + Sync + Send { /// Retrieves the list of available schema names in this catalog. fn schema_names(&self) -> Vec; @@ -152,15 +148,31 @@ pub trait CatalogProvider: Debug + Sync + Send { } } +impl dyn CatalogProvider { + /// Returns `true` if the catalog provider is of type `T`. + /// + /// Prefer this over `downcast_ref::().is_some()`. Works correctly when + /// called on `Arc` via auto-deref. + pub fn is(&self) -> bool { + (self as &dyn Any).is::() + } + + /// Attempts to downcast this catalog provider to a concrete type `T`, + /// returning `None` if the provider is not of that type. + /// + /// Works correctly when called on `Arc` via auto-deref, + /// unlike `(&arc as &dyn Any).downcast_ref::()` which would attempt to + /// downcast the `Arc` itself. + pub fn downcast_ref(&self) -> Option<&T> { + (self as &dyn Any).downcast_ref() + } +} + /// Represent a list of named [`CatalogProvider`]s. /// /// Please see the documentation on [`CatalogProvider`] for details of /// implementing a custom catalog. -pub trait CatalogProviderList: Debug + Sync + Send { - /// Returns the catalog list as [`Any`] - /// so that it can be downcast to a specific implementation. - fn as_any(&self) -> &dyn Any; - +pub trait CatalogProviderList: Any + Debug + Sync + Send { /// Adds a new catalog to this catalog list /// If a catalog of the same name existed before, it is replaced in the list and returned. fn register_catalog( @@ -175,3 +187,23 @@ pub trait CatalogProviderList: Debug + Sync + Send { /// Retrieves a specific catalog by name, provided it exists. 
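With `CatalogProvider: Any`, the per-implementation `as_any` boilerplate disappears and callers downcast through the new inherent helpers on the trait object. A usage sketch (the `MemoryCatalogProvider` target and the trait methods are real; the surrounding function is illustrative):

    use std::sync::Arc;
    use datafusion_catalog::{CatalogProvider, MemoryCatalogProvider};

    fn describe(catalog: &Arc<dyn CatalogProvider>) {
        // Auto-deref means this inspects the provider itself, not the Arc,
        // which is exactly the pitfall the doc comments above warn about.
        if let Some(mem) = catalog.downcast_ref::<MemoryCatalogProvider>() {
            println!("in-memory catalog with schemas: {:?}", mem.schema_names());
        } else {
            println!("some other CatalogProvider implementation");
        }
    }

    fn main() {
        let catalog: Arc<dyn CatalogProvider> = Arc::new(MemoryCatalogProvider::new());
        describe(&catalog);
    }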
fn catalog(&self, name: &str) -> Option>; } + +impl dyn CatalogProviderList { + /// Returns `true` if the catalog provider list is of type `T`. + /// + /// Prefer this over `downcast_ref::().is_some()`. Works correctly when + /// called on `Arc` via auto-deref. + pub fn is(&self) -> bool { + (self as &dyn Any).is::() + } + + /// Attempts to downcast this catalog provider list to a concrete type `T`, + /// returning `None` if the provider list is not of that type. + /// + /// Works correctly when called on `Arc` via + /// auto-deref, unlike `(&arc as &dyn Any).downcast_ref::()` which would + /// attempt to downcast the `Arc` itself. + pub fn downcast_ref(&self) -> Option<&T> { + (self as &dyn Any).downcast_ref() + } +} diff --git a/datafusion/catalog/src/cte_worktable.rs b/datafusion/catalog/src/cte_worktable.rs index d6b2a453118c9..dd313ebb4cbff 100644 --- a/datafusion/catalog/src/cte_worktable.rs +++ b/datafusion/catalog/src/cte_worktable.rs @@ -17,20 +17,17 @@ //! CteWorkTable implementation used for recursive queries +use std::borrow::Cow; use std::sync::Arc; -use std::{any::Any, borrow::Cow}; -use crate::Session; use arrow::datatypes::SchemaRef; use async_trait::async_trait; -use datafusion_physical_plan::work_table::WorkTableExec; - -use datafusion_physical_plan::ExecutionPlan; - use datafusion_common::error::Result; use datafusion_expr::{Expr, LogicalPlan, TableProviderFilterPushDown, TableType}; +use datafusion_physical_plan::ExecutionPlan; +use datafusion_physical_plan::work_table::WorkTableExec; -use crate::TableProvider; +use crate::{ScanArgs, ScanResult, Session, TableProvider}; /// The temporary working table where the previous iteration of a recursive query is stored /// Naming is based on PostgreSQL's implementation. @@ -67,10 +64,6 @@ impl CteWorkTable { #[async_trait] impl TableProvider for CteWorkTable { - fn as_any(&self) -> &dyn Any { - self - } - fn get_logical_plan(&'_ self) -> Option> { None } @@ -85,16 +78,28 @@ impl TableProvider for CteWorkTable { async fn scan( &self, - _state: &dyn Session, - _projection: Option<&Vec>, - _filters: &[Expr], - _limit: Option, + state: &dyn Session, + projection: Option<&Vec>, + filters: &[Expr], + limit: Option, ) -> Result> { - // TODO: pushdown filters and limits - Ok(Arc::new(WorkTableExec::new( + let options = ScanArgs::default() + .with_projection(projection.map(|p| p.as_slice())) + .with_filters(Some(filters)) + .with_limit(limit); + Ok(self.scan_with_args(state, options).await?.into_inner()) + } + + async fn scan_with_args<'a>( + &self, + _state: &dyn Session, + args: ScanArgs<'a>, + ) -> Result { + Ok(ScanResult::new(Arc::new(WorkTableExec::new( self.name.clone(), Arc::clone(&self.table_schema), - ))) + args.projection().map(|p| p.to_vec()), + )?))) } fn supports_filters_pushdown( diff --git a/datafusion/catalog/src/default_table_source.rs b/datafusion/catalog/src/default_table_source.rs index 11963c06c88f5..60f85891d66e6 100644 --- a/datafusion/catalog/src/default_table_source.rs +++ b/datafusion/catalog/src/default_table_source.rs @@ -17,13 +17,13 @@ //! 
Default TableSource implementation used in DataFusion physical plans +use std::borrow::Cow; use std::sync::Arc; -use std::{any::Any, borrow::Cow}; use crate::TableProvider; use arrow::datatypes::SchemaRef; -use datafusion_common::{internal_err, Constraints}; +use datafusion_common::{Constraints, internal_err}; use datafusion_expr::{Expr, TableProviderFilterPushDown, TableSource, TableType}; /// Implements [`TableSource`] for a [`TableProvider`] @@ -46,12 +46,6 @@ impl DefaultTableSource { } impl TableSource for DefaultTableSource { - /// Returns the table source as [`Any`] so that it can be - /// downcast to a specific implementation. - fn as_any(&self) -> &dyn Any { - self - } - /// Get a reference to the schema for this table fn schema(&self) -> SchemaRef { self.table_provider.schema() @@ -97,11 +91,7 @@ pub fn provider_as_source( pub fn source_as_provider( source: &Arc, ) -> datafusion_common::Result> { - match source - .as_ref() - .as_any() - .downcast_ref::() - { + match source.as_ref().downcast_ref::() { Some(source) => Ok(Arc::clone(&source.table_provider)), _ => internal_err!("TableSource was not DefaultTableSource"), } @@ -117,10 +107,6 @@ fn preserves_table_type() { #[async_trait] impl TableProvider for TestTempTable { - fn as_any(&self) -> &dyn Any { - self - } - fn table_type(&self) -> TableType { TableType::Temporary } diff --git a/datafusion/catalog/src/dynamic_file/catalog.rs b/datafusion/catalog/src/dynamic_file/catalog.rs index ccccb9762eb4c..f93bd35cd7f0a 100644 --- a/datafusion/catalog/src/dynamic_file/catalog.rs +++ b/datafusion/catalog/src/dynamic_file/catalog.rs @@ -19,7 +19,6 @@ use crate::{CatalogProvider, CatalogProviderList, SchemaProvider, TableProvider}; use async_trait::async_trait; -use std::any::Any; use std::fmt::Debug; use std::sync::Arc; @@ -42,10 +41,6 @@ impl DynamicFileCatalog { } impl CatalogProviderList for DynamicFileCatalog { - fn as_any(&self) -> &dyn Any { - self - } - fn register_catalog( &self, name: String, @@ -87,10 +82,6 @@ impl DynamicFileCatalogProvider { } impl CatalogProvider for DynamicFileCatalogProvider { - fn as_any(&self) -> &dyn Any { - self - } - fn schema_names(&self) -> Vec { self.inner.schema_names() } @@ -137,10 +128,6 @@ impl DynamicFileSchemaProvider { #[async_trait] impl SchemaProvider for DynamicFileSchemaProvider { - fn as_any(&self) -> &dyn Any { - self - } - fn table_names(&self) -> Vec { self.inner.table_names() } diff --git a/datafusion/core/src/datasource/empty.rs b/datafusion/catalog/src/empty.rs similarity index 89% rename from datafusion/core/src/datasource/empty.rs rename to datafusion/catalog/src/empty.rs index 77686c5eb7c27..1ff36ecf360a2 100644 --- a/datafusion/core/src/datasource/empty.rs +++ b/datafusion/catalog/src/empty.rs @@ -17,19 +17,17 @@ //! [`EmptyTable`] useful for testing. -use std::any::Any; use std::sync::Arc; use arrow::datatypes::*; use async_trait::async_trait; -use datafusion_catalog::Session; -use datafusion_common::project_schema; - -use crate::datasource::{TableProvider, TableType}; -use crate::error::Result; -use crate::logical_expr::Expr; -use datafusion_physical_plan::empty::EmptyExec; +use datafusion_common::{Result, project_schema}; +use datafusion_expr::{Expr, TableType}; use datafusion_physical_plan::ExecutionPlan; +use datafusion_physical_plan::empty::EmptyExec; + +use crate::Session; +use crate::TableProvider; /// An empty plan that is useful for testing and generating plans /// without mapping them to actual data. 
@@ -57,10 +55,6 @@ impl EmptyTable { #[async_trait] impl TableProvider for EmptyTable { - fn as_any(&self) -> &dyn Any { - self - } - fn schema(&self) -> SchemaRef { Arc::clone(&self.schema) } diff --git a/datafusion/catalog/src/information_schema.rs b/datafusion/catalog/src/information_schema.rs index d733551f44051..34c677c3dd43e 100644 --- a/datafusion/catalog/src/information_schema.rs +++ b/datafusion/catalog/src/information_schema.rs @@ -24,23 +24,27 @@ use crate::{CatalogProviderList, SchemaProvider, TableProvider}; use arrow::array::builder::{BooleanBuilder, UInt8Builder}; use arrow::{ array::{StringBuilder, UInt64Builder}, - datatypes::{DataType, Field, Schema, SchemaRef}, + datatypes::{DataType, Field, FieldRef, Schema, SchemaRef}, record_batch::RecordBatch, }; use async_trait::async_trait; +use datafusion_common::DataFusionError; use datafusion_common::config::{ConfigEntry, ConfigOptions}; use datafusion_common::error::Result; use datafusion_common::types::NativeType; -use datafusion_common::DataFusionError; use datafusion_execution::TaskContext; -use datafusion_expr::{AggregateUDF, ScalarUDF, Signature, TypeSignature, WindowUDF}; +use datafusion_execution::runtime_env::RuntimeEnv; +use datafusion_expr::function::WindowUDFFieldArgs; +use datafusion_expr::{ + AggregateUDF, ReturnFieldArgs, ScalarUDF, Signature, TypeSignature, WindowUDF, +}; use datafusion_expr::{TableType, Volatility}; +use datafusion_physical_plan::SendableRecordBatchStream; use datafusion_physical_plan::stream::RecordBatchStreamAdapter; use datafusion_physical_plan::streaming::PartitionStream; -use datafusion_physical_plan::SendableRecordBatchStream; use std::collections::{BTreeSet, HashMap, HashSet}; use std::fmt::Debug; -use std::{any::Any, sync::Arc}; +use std::sync::Arc; pub const INFORMATION_SCHEMA: &str = "information_schema"; pub(crate) const TABLES: &str = "tables"; @@ -137,11 +141,11 @@ impl InformationSchemaConfig { let catalog = self.catalog_list.catalog(&catalog_name).unwrap(); for schema_name in catalog.schema_names() { - if schema_name != INFORMATION_SCHEMA { - if let Some(schema) = catalog.schema(&schema_name) { - let schema_owner = schema.owner_name(); - builder.add_schemata(&catalog_name, &schema_name, schema_owner); - } + if schema_name != INFORMATION_SCHEMA + && let Some(schema) = catalog.schema(&schema_name) + { + let schema_owner = schema.owner_name(); + builder.add_schemata(&catalog_name, &schema_name, schema_owner); } } } @@ -215,11 +219,16 @@ impl InformationSchemaConfig { fn make_df_settings( &self, config_options: &ConfigOptions, + runtime_env: &Arc, builder: &mut InformationSchemaDfSettingsBuilder, ) { for entry in config_options.entries() { builder.add_setting(entry); } + // Add runtime configuration entries + for entry in runtime_env.config_entries() { + builder.add_setting(entry); + } } fn make_routines( @@ -245,7 +254,7 @@ impl InformationSchemaConfig { name, "FUNCTION", Self::is_deterministic(udf.signature()), - return_type, + return_type.as_ref(), "SCALAR", udf.documentation().map(|d| d.description.to_string()), udf.documentation().map(|d| d.syntax_example.to_string()), @@ -265,7 +274,7 @@ impl InformationSchemaConfig { name, "FUNCTION", Self::is_deterministic(udaf.signature()), - return_type, + return_type.as_ref(), "AGGREGATE", udaf.documentation().map(|d| d.description.to_string()), udaf.documentation().map(|d| d.syntax_example.to_string()), @@ -285,7 +294,7 @@ impl InformationSchemaConfig { name, "FUNCTION", Self::is_deterministic(udwf.signature()), - return_type, + 
return_type.as_ref(), "WINDOW", udwf.documentation().map(|d| d.description.to_string()), udwf.documentation().map(|d| d.syntax_example.to_string()), @@ -415,14 +424,28 @@ fn get_udf_args_and_return_types( Ok(arg_types .into_iter() .map(|arg_types| { - // only handle the function which implemented [`ScalarUDFImpl::return_type`] method + let arg_fields: Vec = arg_types + .iter() + .enumerate() + .map(|(i, t)| { + Arc::new(Field::new(format!("arg_{i}"), t.clone(), true)) + }) + .collect(); + let scalar_arguments = vec![None; arg_fields.len()]; let return_type = udf - .return_type(&arg_types) - .map(|t| remove_native_type_prefix(NativeType::from(t))) + .return_field_from_args(ReturnFieldArgs { + arg_fields: &arg_fields, + scalar_arguments: &scalar_arguments, + }) + .map(|f| { + remove_native_type_prefix(&NativeType::from( + f.data_type().clone(), + )) + }) .ok(); let arg_types = arg_types .into_iter() - .map(|t| remove_native_type_prefix(NativeType::from(t))) + .map(|t| remove_native_type_prefix(&NativeType::from(t))) .collect::>(); (arg_types, return_type) }) @@ -441,14 +464,24 @@ fn get_udaf_args_and_return_types( Ok(arg_types .into_iter() .map(|arg_types| { - // only handle the function which implemented [`ScalarUDFImpl::return_type`] method + let arg_fields: Vec = arg_types + .iter() + .enumerate() + .map(|(i, t)| { + Arc::new(Field::new(format!("arg_{i}"), t.clone(), true)) + }) + .collect(); let return_type = udaf - .return_type(&arg_types) - .ok() - .map(|t| remove_native_type_prefix(NativeType::from(t))); + .return_field(&arg_fields) + .map(|f| { + remove_native_type_prefix(&NativeType::from( + f.data_type().clone(), + )) + }) + .ok(); let arg_types = arg_types .into_iter() - .map(|t| remove_native_type_prefix(NativeType::from(t))) + .map(|t| remove_native_type_prefix(&NativeType::from(t))) .collect::>(); (arg_types, return_type) }) @@ -467,28 +500,38 @@ fn get_udwf_args_and_return_types( Ok(arg_types .into_iter() .map(|arg_types| { - // only handle the function which implemented [`ScalarUDFImpl::return_type`] method + let arg_fields: Vec = arg_types + .iter() + .enumerate() + .map(|(i, t)| { + Arc::new(Field::new(format!("arg_{i}"), t.clone(), true)) + }) + .collect(); + let return_type = udwf + .field(WindowUDFFieldArgs::new(&arg_fields, udwf.name())) + .map(|f| { + remove_native_type_prefix(&NativeType::from( + f.data_type().clone(), + )) + }) + .ok(); let arg_types = arg_types .into_iter() - .map(|t| remove_native_type_prefix(NativeType::from(t))) + .map(|t| remove_native_type_prefix(&NativeType::from(t))) .collect::>(); - (arg_types, None) + (arg_types, return_type) }) .collect::>()) } } #[inline] -fn remove_native_type_prefix(native_type: NativeType) -> String { +fn remove_native_type_prefix(native_type: &NativeType) -> String { format!("{native_type}") } #[async_trait] impl SchemaProvider for InformationSchemaProvider { - fn as_any(&self) -> &dyn Any { - self - } - fn table_names(&self) -> Vec { INFORMATION_SCHEMA_TABLES .iter() @@ -679,7 +722,7 @@ impl InformationSchemaViewBuilder { catalog_name: impl AsRef, schema_name: impl AsRef, table_name: impl AsRef, - definition: Option>, + definition: Option<&(impl AsRef + ?Sized)>, ) { // Note: append_value is actually infallible. 
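The same placeholder-argument pattern used in the routines builder above can be exercised directly against any `ScalarUDF`. A sketch mirroring that change (the `arg_{i}` placeholder naming and all-`None` scalar arguments are taken from the diff itself; the wrapping helper is hypothetical):

    use std::sync::Arc;
    use arrow::datatypes::{DataType, Field, FieldRef};
    use datafusion_common::Result;
    use datafusion_expr::{ReturnFieldArgs, ScalarUDF};

    /// Resolve a UDF's return type for a given argument signature the way
    /// information_schema.routines now does: wrap the types in placeholder
    /// fields and ask for the full return field.
    fn return_type_for(udf: &ScalarUDF, arg_types: &[DataType]) -> Result<DataType> {
        let arg_fields: Vec<FieldRef> = arg_types
            .iter()
            .enumerate()
            .map(|(i, t)| Arc::new(Field::new(format!("arg_{i}"), t.clone(), true)))
            .collect();
        // No literal argument values are known at catalog time
        let scalar_arguments = vec![None; arg_fields.len()];
        let field = udf.return_field_from_args(ReturnFieldArgs {
            arg_fields: &arg_fields,
            scalar_arguments: &scalar_arguments,
        })?;
        Ok(field.data_type().clone())
    }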
self.catalog_names.append_value(catalog_name.as_ref()); @@ -1060,7 +1103,12 @@ impl PartitionStream for InformationSchemaDfSettings { // TODO: Stream this futures::stream::once(async move { // create a mem table with the names of tables - config.make_df_settings(ctx.session_config().options(), &mut builder); + let runtime_env = ctx.runtime_env(); + config.make_df_settings( + ctx.session_config().options(), + &runtime_env, + &mut builder, + ); Ok(builder.finish()) }), )) @@ -1156,7 +1204,7 @@ struct InformationSchemaRoutinesBuilder { } impl InformationSchemaRoutinesBuilder { - #[allow(clippy::too_many_arguments)] + #[expect(clippy::too_many_arguments)] fn add_routine( &mut self, catalog_name: impl AsRef, @@ -1164,7 +1212,7 @@ impl InformationSchemaRoutinesBuilder { routine_name: impl AsRef, routine_type: impl AsRef, is_deterministic: bool, - data_type: Option>, + data_type: Option<&impl AsRef>, function_type: impl AsRef, description: Option>, syntax_example: Option>, @@ -1290,7 +1338,7 @@ struct InformationSchemaParametersBuilder { } impl InformationSchemaParametersBuilder { - #[allow(clippy::too_many_arguments)] + #[expect(clippy::too_many_arguments)] fn add_parameter( &mut self, specific_catalog: impl AsRef, @@ -1298,7 +1346,7 @@ impl InformationSchemaParametersBuilder { specific_name: impl AsRef, ordinal_position: u64, parameter_mode: impl AsRef, - parameter_name: Option>, + parameter_name: Option<&(impl AsRef + ?Sized)>, data_type: impl AsRef, parameter_default: Option>, is_variadic: bool, @@ -1397,11 +1445,9 @@ mod tests { // InformationSchemaConfig::make_tables used this before `table_type` // existed but should not, as it may be expensive. async fn table(&self, _: &str) -> Result>> { - panic!("InformationSchemaConfig::make_tables called SchemaProvider::table instead of table_type") - } - - fn as_any(&self) -> &dyn Any { - unimplemented!("not required for these tests") + panic!( + "InformationSchemaConfig::make_tables called SchemaProvider::table instead of table_type" + ) } fn table_names(&self) -> Vec { @@ -1414,10 +1460,6 @@ mod tests { } impl CatalogProviderList for Fixture { - fn as_any(&self) -> &dyn Any { - unimplemented!("not required for these tests") - } - fn register_catalog( &self, _: String, @@ -1436,10 +1478,6 @@ mod tests { } impl CatalogProvider for Fixture { - fn as_any(&self) -> &dyn Any { - unimplemented!("not required for these tests") - } - fn schema_names(&self) -> Vec { vec!["aschema".to_string()] } diff --git a/datafusion/catalog/src/lib.rs b/datafusion/catalog/src/lib.rs index 1c5e38438724e..33d54b7cb89d5 100644 --- a/datafusion/catalog/src/lib.rs +++ b/datafusion/catalog/src/lib.rs @@ -23,6 +23,7 @@ // Make sure fast / cheap clones on Arc are explicit: // https://github.com/apache/datafusion/issues/11143 #![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))] +#![cfg_attr(test, allow(clippy::needless_pass_by_value))] //! Interfaces and default implementations of catalogs and schemas. //! 
@@ -33,6 +34,7 @@ pub mod cte_worktable; pub mod default_table_source; +pub mod empty; pub mod information_schema; pub mod listing_schema; pub mod memory; @@ -46,13 +48,13 @@ mod dynamic_file; mod schema; mod table; +pub use r#async::*; pub use catalog::*; pub use datafusion_session::Session; pub use dynamic_file::catalog::*; pub use memory::{ MemTable, MemoryCatalogProvider, MemoryCatalogProviderList, MemorySchemaProvider, }; -pub use r#async::*; pub use schema::*; pub use table::*; diff --git a/datafusion/catalog/src/listing_schema.rs b/datafusion/catalog/src/listing_schema.rs index af96cfc15fc82..d38fe659aaa97 100644 --- a/datafusion/catalog/src/listing_schema.rs +++ b/datafusion/catalog/src/listing_schema.rs @@ -17,7 +17,6 @@ //! [`ListingSchemaProvider`]: [`SchemaProvider`] that scans ObjectStores for tables automatically -use std::any::Any; use std::collections::HashSet; use std::path::Path; use std::sync::{Arc, Mutex}; @@ -26,7 +25,7 @@ use crate::{SchemaProvider, TableProvider, TableProviderFactory}; use crate::Session; use datafusion_common::{ - internal_datafusion_err, DFSchema, DataFusionError, HashMap, TableReference, + DFSchema, DataFusionError, HashMap, TableReference, internal_datafusion_err, }; use datafusion_expr::CreateExternalTable; @@ -127,22 +126,13 @@ impl ListingSchemaProvider { .factory .create( state, - &CreateExternalTable { - schema: Arc::new(DFSchema::empty()), + &CreateExternalTable::builder( name, - location: table_url, - file_type: self.format.clone(), - table_partition_cols: vec![], - if_not_exists: false, - or_replace: false, - temporary: false, - definition: None, - order_exprs: vec![], - unbounded: false, - options: Default::default(), - constraints: Default::default(), - column_defaults: Default::default(), - }, + table_url, + self.format.clone(), + Arc::new(DFSchema::empty()), + ) + .build(), ) .await?; let _ = @@ -155,10 +145,6 @@ impl ListingSchemaProvider { #[async_trait] impl SchemaProvider for ListingSchemaProvider { - fn as_any(&self) -> &dyn Any { - self - } - fn table_names(&self) -> Vec { self.tables .lock() diff --git a/datafusion/catalog/src/memory/catalog.rs b/datafusion/catalog/src/memory/catalog.rs index b71888c54e9d6..ebe6b9dfa0ebc 100644 --- a/datafusion/catalog/src/memory/catalog.rs +++ b/datafusion/catalog/src/memory/catalog.rs @@ -21,7 +21,6 @@ use crate::{CatalogProvider, CatalogProviderList, SchemaProvider}; use dashmap::DashMap; use datafusion_common::exec_err; -use std::any::Any; use std::sync::Arc; /// Simple in-memory list of catalogs @@ -47,10 +46,6 @@ impl Default for MemoryCatalogProviderList { } impl CatalogProviderList for MemoryCatalogProviderList { - fn as_any(&self) -> &dyn Any { - self - } - fn register_catalog( &self, name: String, @@ -90,10 +85,6 @@ impl Default for MemoryCatalogProvider { } impl CatalogProvider for MemoryCatalogProvider { - fn as_any(&self) -> &dyn Any { - self - } - fn schema_names(&self) -> Vec { self.schemas.iter().map(|s| s.key().clone()).collect() } diff --git a/datafusion/catalog/src/memory/schema.rs b/datafusion/catalog/src/memory/schema.rs index f1b3628f7affc..46b0beb440613 100644 --- a/datafusion/catalog/src/memory/schema.rs +++ b/datafusion/catalog/src/memory/schema.rs @@ -20,8 +20,7 @@ use crate::{SchemaProvider, TableProvider}; use async_trait::async_trait; use dashmap::DashMap; -use datafusion_common::{exec_err, DataFusionError}; -use std::any::Any; +use datafusion_common::{DataFusionError, exec_err}; use std::sync::Arc; /// Simple in-memory implementation of a schema. 
@@ -47,10 +46,6 @@ impl Default for MemorySchemaProvider { #[async_trait] impl SchemaProvider for MemorySchemaProvider { - fn as_any(&self) -> &dyn Any { - self - } - fn table_names(&self) -> Vec { self.tables .iter() diff --git a/datafusion/catalog/src/memory/table.rs b/datafusion/catalog/src/memory/table.rs index 90224f6a37bc3..8102c15079658 100644 --- a/datafusion/catalog/src/memory/table.rs +++ b/datafusion/catalog/src/memory/table.rs @@ -17,27 +17,36 @@ //! [`MemTable`] for querying `Vec` by DataFusion. -use std::any::Any; use std::collections::HashMap; use std::fmt::Debug; use std::sync::Arc; use crate::TableProvider; -use arrow::datatypes::SchemaRef; +use arrow::array::{ + Array, ArrayRef, BooleanArray, RecordBatch as ArrowRecordBatch, UInt64Array, +}; +use arrow::compute::kernels::zip::zip; +use arrow::compute::{and, filter_record_batch}; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use datafusion_common::error::Result; -use datafusion_common::{not_impl_err, plan_err, Constraints, DFSchema, SchemaExt}; +use datafusion_common::tree_node::TreeNodeRecursion; +use datafusion_common::{Constraints, DFSchema, SchemaExt, not_impl_err, plan_err}; use datafusion_common_runtime::JoinSet; use datafusion_datasource::memory::{MemSink, MemorySourceConfig}; use datafusion_datasource::sink::DataSinkExec; use datafusion_datasource::source::DataSourceExec; use datafusion_expr::dml::InsertOp; use datafusion_expr::{Expr, SortExpr, TableType}; -use datafusion_physical_expr::{create_physical_sort_exprs, LexOrdering}; +use datafusion_physical_expr::{ + LexOrdering, create_physical_expr, create_physical_sort_exprs, +}; use datafusion_physical_plan::repartition::RepartitionExec; +use datafusion_physical_plan::stream::RecordBatchStreamAdapter; use datafusion_physical_plan::{ - common, ExecutionPlan, ExecutionPlanProperties, Partitioning, + DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, Partitioning, + PhysicalExpr, PlanProperties, common, }; use datafusion_session::Session; @@ -204,10 +213,6 @@ impl MemTable { #[async_trait] impl TableProvider for MemTable { - fn as_any(&self) -> &dyn Any { - self - } - fn schema(&self) -> SchemaRef { Arc::clone(&self.schema) } @@ -295,4 +300,338 @@ impl TableProvider for MemTable { fn get_column_default(&self, column: &str) -> Option<&Expr> { self.column_defaults.get(column) } + + async fn delete_from( + &self, + state: &dyn Session, + filters: Vec, + ) -> Result> { + // Early exit if table has no partitions + if self.batches.is_empty() { + return Ok(Arc::new(DmlResultExec::new(0))); + } + + *self.sort_order.lock() = vec![]; + + let mut total_deleted: u64 = 0; + let df_schema = DFSchema::try_from(Arc::clone(&self.schema))?; + + for partition_data in &self.batches { + let mut partition = partition_data.write().await; + let mut new_batches = Vec::with_capacity(partition.len()); + + for batch in partition.iter() { + if batch.num_rows() == 0 { + continue; + } + + // Evaluate filters - None means "match all rows" + let filter_mask = evaluate_filters_to_mask( + &filters, + batch, + &df_schema, + state.execution_props(), + )?; + + let (delete_count, keep_mask) = match filter_mask { + Some(mask) => { + // Count rows where mask is true (will be deleted) + let count = mask.iter().filter(|v| v == &Some(true)).count(); + // Keep rows where predicate is false or NULL (SQL three-valued logic) + let keep: BooleanArray = + mask.iter().map(|v| Some(v != Some(true))).collect(); + (count, keep) + } + None => { + // 
+                        // No filters = delete all rows
+                        (
+                            batch.num_rows(),
+                            BooleanArray::from(vec![false; batch.num_rows()]),
+                        )
+                    }
+                };
+
+                total_deleted += delete_count as u64;
+
+                let filtered_batch = filter_record_batch(batch, &keep_mask)?;
+                if filtered_batch.num_rows() > 0 {
+                    new_batches.push(filtered_batch);
+                }
+            }
+
+            *partition = new_batches;
+        }
+
+        Ok(Arc::new(DmlResultExec::new(total_deleted)))
+    }
+
+    async fn update(
+        &self,
+        state: &dyn Session,
+        assignments: Vec<(String, Expr)>,
+        filters: Vec<Expr>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        // Early exit if table has no partitions
+        if self.batches.is_empty() {
+            return Ok(Arc::new(DmlResultExec::new(0)));
+        }
+
+        // Validate column names upfront with clear error messages
+        let available_columns: Vec<&str> = self
+            .schema
+            .fields()
+            .iter()
+            .map(|f| f.name().as_str())
+            .collect();
+        for (column_name, _) in &assignments {
+            if self.schema.field_with_name(column_name).is_err() {
+                return plan_err!(
+                    "UPDATE failed: column '{}' does not exist. Available columns: {}",
+                    column_name,
+                    available_columns.join(", ")
+                );
+            }
+        }
+
+        let df_schema = DFSchema::try_from(Arc::clone(&self.schema))?;
+
+        // Create physical expressions for assignments upfront (outside batch loop)
+        let physical_assignments: HashMap<String, Arc<dyn PhysicalExpr>> = assignments
+            .iter()
+            .map(|(name, expr)| {
+                let physical_expr =
+                    create_physical_expr(expr, &df_schema, state.execution_props())?;
+                Ok((name.clone(), physical_expr))
+            })
+            .collect::<Result<_>>()?;
+
+        *self.sort_order.lock() = vec![];
+
+        let mut total_updated: u64 = 0;
+
+        for partition_data in &self.batches {
+            let mut partition = partition_data.write().await;
+            let mut new_batches = Vec::with_capacity(partition.len());
+
+            for batch in partition.iter() {
+                if batch.num_rows() == 0 {
+                    continue;
+                }
+
+                // Evaluate filters - None means "match all rows"
+                let filter_mask = evaluate_filters_to_mask(
+                    &filters,
+                    batch,
+                    &df_schema,
+                    state.execution_props(),
+                )?;
+
+                let (update_count, update_mask) = match filter_mask {
+                    Some(mask) => {
+                        // Count rows where mask is true (will be updated)
+                        let count = mask.iter().filter(|v| v == &Some(true)).count();
+                        // Normalize mask: only true (not NULL) triggers update
+                        let normalized: BooleanArray =
+                            mask.iter().map(|v| Some(v == Some(true))).collect();
+                        (count, normalized)
+                    }
+                    None => {
+                        // No filters = update all rows
+                        (
+                            batch.num_rows(),
+                            BooleanArray::from(vec![true; batch.num_rows()]),
+                        )
+                    }
+                };
+
+                total_updated += update_count as u64;
+
+                if update_count == 0 {
+                    new_batches.push(batch.clone());
+                    continue;
+                }
+
+                let mut new_columns: Vec<ArrayRef> =
+                    Vec::with_capacity(batch.num_columns());
+
+                for field in self.schema.fields() {
+                    let column_name = field.name();
+                    let original_column =
+                        batch.column_by_name(column_name).ok_or_else(|| {
+                            datafusion_common::DataFusionError::Internal(format!(
+                                "Column '{column_name}' not found in batch"
+                            ))
+                        })?;
+
+                    let new_column = if let Some(physical_expr) =
+                        physical_assignments.get(column_name.as_str())
+                    {
+                        // Use evaluate_selection to only evaluate on matching rows.
+                        // This avoids errors (e.g., divide-by-zero) on rows that won't
+                        // be updated. The result is scattered back with nulls for
+                        // non-matching rows, which zip() will replace with originals.
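+                        // Illustrative walk-through (hypothetical values): with
+                        // update_mask = [true, false, true] and assignment
+                        // `b = 10 / a` over a = [5, 0, 2], evaluate_selection
+                        // yields [2, NULL, 5] and never divides by the
+                        // unselected a = 0; zip(update_mask, new, original)
+                        // then emits [2, b_orig, 5].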
+                        let new_values =
+                            physical_expr.evaluate_selection(batch, &update_mask)?;
+                        let new_array = new_values.into_array(batch.num_rows())?;
+
+                        // Convert to &dyn Array which implements Datum
+                        let new_arr: &dyn Array = new_array.as_ref();
+                        let orig_arr: &dyn Array = original_column.as_ref();
+                        zip(&update_mask, &new_arr, &orig_arr)?
+                    } else {
+                        Arc::clone(original_column)
+                    };
+
+                    new_columns.push(new_column);
+                }
+
+                let updated_batch =
+                    ArrowRecordBatch::try_new(Arc::clone(&self.schema), new_columns)?;
+                new_batches.push(updated_batch);
+            }
+
+            *partition = new_batches;
+        }
+
+        Ok(Arc::new(DmlResultExec::new(total_updated)))
+    }
+}
+
+/// Evaluate filter expressions against a batch and return a combined boolean mask.
+/// Returns None if filters is empty (meaning "match all rows").
+/// The returned mask has true for rows that match the filter predicates.
+fn evaluate_filters_to_mask(
+    filters: &[Expr],
+    batch: &RecordBatch,
+    df_schema: &DFSchema,
+    execution_props: &datafusion_expr::execution_props::ExecutionProps,
+) -> Result<Option<BooleanArray>> {
+    if filters.is_empty() {
+        return Ok(None);
+    }
+
+    let mut combined_mask: Option<BooleanArray> = None;
+
+    for filter_expr in filters {
+        let physical_expr =
+            create_physical_expr(filter_expr, df_schema, execution_props)?;
+
+        let result = physical_expr.evaluate(batch)?;
+        let array = result.into_array(batch.num_rows())?;
+        let bool_array = array
+            .as_any()
+            .downcast_ref::<BooleanArray>()
+            .ok_or_else(|| {
+                datafusion_common::DataFusionError::Internal(
+                    "Filter did not evaluate to boolean".to_string(),
+                )
+            })?
+            .clone();
+
+        combined_mask = Some(match combined_mask {
+            Some(existing) => and(&existing, &bool_array)?,
+            None => bool_array,
+        });
+    }
+
+    Ok(combined_mask)
+}
+
+/// Returns a single row with the count of affected rows.
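+// Consumer-side sketch (not shown in this module): the count can be read back
+// by executing the returned plan and downcasting the single `count` column.
+// `ctx` (a `TaskContext`) is assumed to be available:
+//
+//     let plan = table.delete_from(state, filters).await?;
+//     let batches = common::collect(plan.execute(0, ctx)?).await?;
+//     let n = batches[0]
+//         .column(0)
+//         .as_any()
+//         .downcast_ref::<UInt64Array>()
+//         .expect("count column is UInt64")
+//         .value(0);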
+#[derive(Debug)]
+struct DmlResultExec {
+    rows_affected: u64,
+    schema: SchemaRef,
+    properties: Arc<PlanProperties>,
+}
+
+impl DmlResultExec {
+    fn new(rows_affected: u64) -> Self {
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "count",
+            DataType::UInt64,
+            false,
+        )]));
+
+        let properties = PlanProperties::new(
+            datafusion_physical_expr::EquivalenceProperties::new(Arc::clone(&schema)),
+            Partitioning::UnknownPartitioning(1),
+            datafusion_physical_plan::execution_plan::EmissionType::Final,
+            datafusion_physical_plan::execution_plan::Boundedness::Bounded,
+        );
+
+        Self {
+            rows_affected,
+            schema,
+            properties: Arc::new(properties),
+        }
+    }
+}
+
+impl DisplayAs for DmlResultExec {
+    fn fmt_as(
+        &self,
+        t: DisplayFormatType,
+        f: &mut std::fmt::Formatter,
+    ) -> std::fmt::Result {
+        match t {
+            DisplayFormatType::Default
+            | DisplayFormatType::Verbose
+            | DisplayFormatType::TreeRender => {
+                write!(f, "DmlResultExec: rows_affected={}", self.rows_affected)
+            }
+        }
+    }
+}
+
+impl ExecutionPlan for DmlResultExec {
+    fn name(&self) -> &str {
+        "DmlResultExec"
+    }
+
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema)
+    }
+
+    fn properties(&self) -> &Arc<PlanProperties> {
+        &self.properties
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        vec![]
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        _children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        Ok(self)
+    }
+
+    fn execute(
+        &self,
+        _partition: usize,
+        _context: Arc<TaskContext>,
+    ) -> Result<SendableRecordBatchStream> {
+        // Create a single batch with the count
+        let count_array = UInt64Array::from(vec![self.rows_affected]);
+        let batch = ArrowRecordBatch::try_new(
+            Arc::clone(&self.schema),
+            vec![Arc::new(count_array) as ArrayRef],
+        )?;
+
+        // Create a stream that yields just this one batch
+        let stream = futures::stream::iter(vec![Ok(batch)]);
+        Ok(Box::pin(RecordBatchStreamAdapter::new(
+            Arc::clone(&self.schema),
+            stream,
+        )))
+    }
+
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
+}
diff --git a/datafusion/catalog/src/schema.rs b/datafusion/catalog/src/schema.rs
index 9ba55256f1824..d99027593ccce 100644
--- a/datafusion/catalog/src/schema.rs
+++ b/datafusion/catalog/src/schema.rs
@@ -19,7 +19,7 @@
 //! representing collections of named tables.
 
 use async_trait::async_trait;
-use datafusion_common::{exec_err, DataFusionError};
+use datafusion_common::{DataFusionError, exec_err};
 use std::any::Any;
 use std::fmt::Debug;
 use std::sync::Arc;
@@ -34,17 +34,13 @@ use datafusion_expr::TableType;
 ///
 /// [`CatalogProvider`]: super::CatalogProvider
 #[async_trait]
-pub trait SchemaProvider: Debug + Sync + Send {
+pub trait SchemaProvider: Any + Debug + Sync + Send {
     /// Returns the owner of the Schema, default is None. This value is reported
     /// as part of `information_tables.schemata
     fn owner_name(&self) -> Option<&str> {
         None
     }
 
-    /// Returns this `SchemaProvider` as [`Any`] so that it can be downcast to a
-    /// specific implementation.
-    fn as_any(&self) -> &dyn Any;
-
     /// Retrieves the list of available table names in this schema.
     fn table_names(&self) -> Vec<String>;
 
@@ -68,7 +64,7 @@ pub trait SchemaProvider: Debug + Sync + Send {
     ///
     /// If a table of the same name was already registered, returns "Table
     /// already exists" error.
-    #[allow(unused_variables)]
+    #[expect(unused_variables)]
     fn register_table(
         &self,
         name: String,
@@ -81,7 +77,7 @@
     /// schema and returns the previously registered [`TableProvider`], if any.
     ///
     /// If no `name` table exists, returns Ok(None).
-    #[allow(unused_variables)]
+    #[expect(unused_variables)]
     fn deregister_table(&self, name: &str) -> Result<Option<Arc<dyn TableProvider>>> {
         exec_err!("schema provider does not support deregistering tables")
     }
@@ -89,3 +85,23 @@
     /// Returns true if table exist in the schema provider, false otherwise.
     fn table_exist(&self, name: &str) -> bool;
 }
+
+impl dyn SchemaProvider {
+    /// Returns `true` if the schema provider is of type `T`.
+    ///
+    /// Prefer this over `downcast_ref::<T>().is_some()`. Works correctly when
+    /// called on `Arc<dyn SchemaProvider>` via auto-deref.
+    pub fn is<T: SchemaProvider>(&self) -> bool {
+        (self as &dyn Any).is::<T>()
+    }
+
+    /// Attempts to downcast this schema provider to a concrete type `T`,
+    /// returning `None` if the provider is not of that type.
+    ///
+    /// Works correctly when called on `Arc<dyn SchemaProvider>` via auto-deref,
+    /// unlike `(&arc as &dyn Any).downcast_ref::<T>()` which would attempt to
+    /// downcast the `Arc` itself.
+    pub fn downcast_ref<T: SchemaProvider>(&self) -> Option<&T> {
+        (self as &dyn Any).downcast_ref()
+    }
+}
diff --git a/datafusion/catalog/src/stream.rs b/datafusion/catalog/src/stream.rs
index f4a2338b8eecb..8501ea65902e2 100644
--- a/datafusion/catalog/src/stream.rs
+++ b/datafusion/catalog/src/stream.rs
@@ -17,7 +17,6 @@
 
 //! TableProvider for stream sources, such as FIFO files
 
-use std::any::Any;
 use std::fmt::Formatter;
 use std::fs::{File, OpenOptions};
 use std::io::BufReader;
@@ -28,7 +27,7 @@ use std::sync::Arc;
 use crate::{Session, TableProvider, TableProviderFactory};
 use arrow::array::{RecordBatch, RecordBatchReader, RecordBatchWriter};
 use arrow::datatypes::SchemaRef;
-use datafusion_common::{config_err, plan_err, Constraints, DataFusionError, Result};
+use datafusion_common::{Constraints, DataFusionError, Result, config_err, plan_err};
 use datafusion_common_runtime::SpawnedTask;
 use datafusion_datasource::sink::{DataSink, DataSinkExec};
 use datafusion_execution::{SendableRecordBatchStream, TaskContext};
@@ -303,10 +302,6 @@ impl StreamTable
 
 #[async_trait]
 impl TableProvider for StreamTable {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
     fn schema(&self) -> SchemaRef {
         Arc::clone(self.0.source.schema())
     }
@@ -405,10 +400,6 @@ impl DisplayAs for StreamWrite
 
 #[async_trait]
 impl DataSink for StreamWrite {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
     fn schema(&self) -> &SchemaRef {
         self.0.source.schema()
     }
diff --git a/datafusion/catalog/src/streaming.rs b/datafusion/catalog/src/streaming.rs
index 082e74dab9a15..e609877c2b778 100644
--- a/datafusion/catalog/src/streaming.rs
+++ b/datafusion/catalog/src/streaming.rs
@@ -17,22 +17,20 @@ //!
A simplified [`TableProvider`] for streaming partitioned datasets -use std::any::Any; use std::sync::Arc; -use crate::Session; -use crate::TableProvider; - use arrow::datatypes::SchemaRef; -use datafusion_common::{plan_err, DFSchema, Result}; +use async_trait::async_trait; +use datafusion_common::{DFSchema, Result, plan_err}; use datafusion_expr::{Expr, SortExpr, TableType}; -use datafusion_physical_expr::{create_physical_sort_exprs, LexOrdering}; -use datafusion_physical_plan::streaming::{PartitionStream, StreamingTableExec}; +use datafusion_physical_expr::equivalence::project_ordering; +use datafusion_physical_expr::{LexOrdering, create_physical_sort_exprs}; use datafusion_physical_plan::ExecutionPlan; - -use async_trait::async_trait; +use datafusion_physical_plan::streaming::{PartitionStream, StreamingTableExec}; use log::debug; +use crate::{Session, TableProvider}; + /// A [`TableProvider`] that streams a set of [`PartitionStream`] #[derive(Debug)] pub struct StreamingTable { @@ -82,10 +80,6 @@ impl StreamingTable { #[async_trait] impl TableProvider for StreamingTable { - fn as_any(&self) -> &dyn Any { - self - } - fn schema(&self) -> SchemaRef { Arc::clone(&self.schema) } @@ -105,7 +99,22 @@ impl TableProvider for StreamingTable { let df_schema = DFSchema::try_from(Arc::clone(&self.schema))?; let eqp = state.execution_props(); - create_physical_sort_exprs(&self.sort_order, &df_schema, eqp)? + let original_sort_exprs = + create_physical_sort_exprs(&self.sort_order, &df_schema, eqp)?; + + if let Some(p) = projection { + // When performing a projection, the output columns will not match + // the original physical sort expression indices. Also the sort columns + // may not be in the output projection. To correct for these issues + // we need to project the ordering based on the output schema. + let schema = Arc::new(self.schema.project(p)?); + LexOrdering::new(original_sort_exprs) + .and_then(|lex_ordering| project_ordering(&lex_ordering, &schema)) + .map(|lex_ordering| lex_ordering.to_vec()) + .unwrap_or_default() + } else { + original_sort_exprs + } } else { vec![] }; diff --git a/datafusion/catalog/src/table.rs b/datafusion/catalog/src/table.rs index 11c9af01a7a54..5d1391bed1172 100644 --- a/datafusion/catalog/src/table.rs +++ b/datafusion/catalog/src/table.rs @@ -23,8 +23,8 @@ use std::sync::Arc; use crate::session::Session; use arrow::datatypes::SchemaRef; use async_trait::async_trait; -use datafusion_common::Result; -use datafusion_common::{not_impl_err, Constraints, Statistics}; +use datafusion_common::{Constraints, Statistics, not_impl_err}; +use datafusion_common::{Result, internal_err}; use datafusion_expr::Expr; use datafusion_expr::dml::InsertOp; @@ -48,11 +48,7 @@ use datafusion_physical_plan::ExecutionPlan; /// [`RecordBatch`]: https://docs.rs/arrow/latest/arrow/record_batch/struct.RecordBatch.html /// [`CatalogProvider`]: super::CatalogProvider #[async_trait] -pub trait TableProvider: Debug + Sync + Send { - /// Returns the table provider as [`Any`] so that it can be - /// downcast to a specific implementation. - fn as_any(&self) -> &dyn Any; - +pub trait TableProvider: Any + Debug + Sync + Send { /// Get a reference to the schema for this table fn schema(&self) -> SchemaRef; @@ -84,10 +80,10 @@ pub trait TableProvider: Debug + Sync + Send { None } - /// Create an [`ExecutionPlan`] for scanning the table with optionally - /// specified `projection`, `filter` and `limit`, described below. 
+ /// Create an [`ExecutionPlan`] for scanning the table with optional + /// `projection`, `filter`, and `limit`, described below. /// - /// The `ExecutionPlan` is responsible scanning the datasource's + /// The returned `ExecutionPlan` is responsible for scanning the datasource's /// partitions in a streaming, parallelized fashion. /// /// # Projection @@ -96,33 +92,30 @@ pub trait TableProvider: Debug + Sync + Send { /// specified. The projection is a set of indexes of the fields in /// [`Self::schema`]. /// - /// DataFusion provides the projection to scan only the columns actually - /// used in the query to improve performance, an optimization called - /// "Projection Pushdown". Some datasources, such as Parquet, can use this - /// information to go significantly faster when only a subset of columns is - /// required. + /// DataFusion provides the projection so the scan reads only the columns + /// actually used in the query, an optimization called "Projection + /// Pushdown". Some datasources, such as Parquet, can use this information + /// to go significantly faster when only a subset of columns is required. /// /// # Filters /// /// A list of boolean filter [`Expr`]s to evaluate *during* the scan, in the /// manner specified by [`Self::supports_filters_pushdown`]. Only rows for - /// which *all* of the `Expr`s evaluate to `true` must be returned (aka the - /// expressions are `AND`ed together). + /// which *all* of the `Expr`s evaluate to `true` must be returned (that is, + /// the expressions are `AND`ed together). /// - /// To enable filter pushdown you must override - /// [`Self::supports_filters_pushdown`] as the default implementation does - /// not and `filters` will be empty. + /// To enable filter pushdown, override + /// [`Self::supports_filters_pushdown`]. The default implementation does not + /// push down filters, and `filters` will be empty. /// - /// DataFusion pushes filtering into the scans whenever possible - /// ("Filter Pushdown"), and depending on the format and the - /// implementation of the format, evaluating the predicate during the scan - /// can increase performance significantly. + /// DataFusion pushes filters into scans whenever possible ("Filter + /// Pushdown"). Depending on the data format and implementation, evaluating + /// predicates during the scan can significantly improve performance. /// /// ## Note: Some columns may appear *only* in Filters /// - /// In certain cases, a query may only use a certain column in a Filter that - /// has been completely pushed down to the scan. In this case, the - /// projection will not contain all the columns found in the filter + /// In some cases, a query may use a column only in a filter and the + /// projection will not contain all columns referenced by the filter /// expressions. /// /// For example, given the query `SELECT t.a FROM t WHERE t.b > 5`, @@ -154,15 +147,40 @@ pub trait TableProvider: Debug + Sync + Send { /// /// # Limit /// - /// If `limit` is specified, must only produce *at least* this many rows, - /// (though it may return more). Like Projection Pushdown and Filter - /// Pushdown, DataFusion pushes `LIMIT`s as far down in the plan as - /// possible, called "Limit Pushdown" as some sources can use this - /// information to improve their performance. Note that if there are any - /// Inexact filters pushed down, the LIMIT cannot be pushed down. 
This is
-    /// because inexact filters do not guarantee that every filtered row is
-    /// removed, so applying the limit could lead to too few rows being available
-    /// to return as a final result.
+    /// If `limit` is specified, the scan must produce *at least* this many
+    /// rows, though it may return more. Like Projection Pushdown and Filter
+    /// Pushdown, DataFusion pushes `LIMIT`s as far down in the plan as
+    /// possible. This is called "Limit Pushdown", and some sources can use the
+    /// information to improve performance.
+    ///
+    /// Note: If any pushed-down filters are `Inexact`, the `LIMIT` cannot be
+    /// pushed down. Inexact filters do not guarantee that every filtered row is
+    /// removed, so applying the limit could leave too few rows to return in the
+    /// final result.
+    ///
+    /// # Evaluation Order
+    ///
+    /// The logical evaluation order is `filters`, then `limit`, then
+    /// `projection`.
+    ///
+    /// Note that `limit` applies to the filtered result, not to the unfiltered
+    /// input, and `projection` affects only which columns are returned, not
+    /// which rows qualify.
+    ///
+    /// For example, if a scan receives:
+    ///
+    /// - `projection = [a]`
+    /// - `filters = [b > 5]`
+    /// - `limit = Some(3)`
+    ///
+    /// It must logically produce results equivalent to:
+    ///
+    /// ```text
+    /// PROJECTION a (LIMIT 3 (SCAN WHERE b > 5))
+    /// ```
+    ///
+    /// As noted above, columns referenced only by pushed-down filters may be
+    /// absent from `projection`.
     async fn scan(
         &self,
         state: &dyn Session,
@@ -246,7 +264,6 @@ pub trait TableProvider: Debug + Sync + Send {
     ///
     /// #[async_trait]
     /// impl TableProvider for TestDataSource {
-    ///     # fn as_any(&self) -> &dyn Any { todo!() }
     ///     # fn schema(&self) -> SchemaRef { todo!() }
     ///     # fn table_type(&self) -> TableType { todo!() }
     ///     # async fn scan(&self, s: &dyn Session, p: Option<&Vec<usize>>, f: &[Expr], l: Option<usize>) -> Result<Arc<dyn ExecutionPlan>> {
@@ -328,6 +345,59 @@ pub trait TableProvider: Debug + Sync + Send {
     ) -> Result<Arc<dyn ExecutionPlan>> {
         not_impl_err!("Insert into not implemented for this table")
     }
+
+    /// Delete rows matching the filter predicates.
+    ///
+    /// Returns an [`ExecutionPlan`] producing a single row with `count` (UInt64).
+    /// Empty `filters` deletes all rows.
+    async fn delete_from(
+        &self,
+        _state: &dyn Session,
+        _filters: Vec<Expr>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        not_impl_err!("DELETE not supported for {} table", self.table_type())
+    }
+
+    /// Update rows matching the filter predicates.
+    ///
+    /// Returns an [`ExecutionPlan`] producing a single row with `count` (UInt64).
+    /// Empty `filters` updates all rows.
+    async fn update(
+        &self,
+        _state: &dyn Session,
+        _assignments: Vec<(String, Expr)>,
+        _filters: Vec<Expr>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        not_impl_err!("UPDATE not supported for {} table", self.table_type())
+    }
+
+    /// Remove all rows from the table.
+    ///
+    /// Returns an [`ExecutionPlan`] producing a single row with `count` (UInt64),
+    /// representing the number of rows removed.
+    async fn truncate(&self, _state: &dyn Session) -> Result<Arc<dyn ExecutionPlan>> {
+        not_impl_err!("TRUNCATE not supported for {} table", self.table_type())
+    }
+}
+
+impl dyn TableProvider {
+    /// Returns `true` if the table provider is of type `T`.
+    ///
+    /// Prefer this over `downcast_ref::<T>().is_some()`. Works correctly when
+    /// called on `Arc<dyn TableProvider>` via auto-deref.
+    pub fn is<T: TableProvider>(&self) -> bool {
+        (self as &dyn Any).is::<T>()
+    }
+
+    /// Attempts to downcast this table provider to a concrete type `T`,
+    /// returning `None` if the provider is not of that type.
+ /// + /// Works correctly when called on `Arc` via auto-deref, + /// unlike `(&arc as &dyn Any).downcast_ref::()` which would attempt to + /// downcast the `Arc` itself. + pub fn downcast_ref(&self) -> Option<&T> { + (self as &dyn Any).downcast_ref() + } } /// Arguments for scanning a table with [`TableProvider::scan_with_args`]. @@ -452,10 +522,49 @@ pub trait TableProviderFactory: Debug + Sync + Send { ) -> Result>; } +/// Describes arguments provided to the table function call. +pub struct TableFunctionArgs<'e, 's> { + /// Call arguments. + exprs: &'e [Expr], + /// Session within which the function is called. + session: &'s dyn Session, +} + +impl<'e, 's> TableFunctionArgs<'e, 's> { + /// Make a new [`TableFunctionArgs`]. + pub fn new(exprs: &'e [Expr], session: &'s dyn Session) -> Self { + Self { exprs, session } + } + + /// Get expressions passed as the called function arguments. + pub fn exprs(&self) -> &'e [Expr] { + self.exprs + } + + /// Get a session where the table function is called. + pub fn session(&self) -> &'s dyn Session { + self.session + } +} + /// A trait for table function implementations -pub trait TableFunctionImpl: Debug + Sync + Send { +pub trait TableFunctionImpl: Debug + Sync + Send + Any { + /// Create a table provider + #[deprecated( + since = "53.0.0", + note = "Implement `TableFunctionImpl::call_with_args` instead" + )] + fn call(&self, _exprs: &[Expr]) -> Result> { + internal_err!( + "TableFunctionImpl::call is not implemented. Implement TableFunctionImpl::call_with_args instead." + ) + } + /// Create a table provider - fn call(&self, args: &[Expr]) -> Result>; + fn call_with_args(&self, args: TableFunctionArgs) -> Result> { + #[expect(deprecated)] + self.call(args.exprs) + } } /// A table that uses a function to generate data @@ -484,7 +593,20 @@ impl TableFunction { } /// Get the function implementation and generate a table + #[deprecated( + since = "53.0.0", + note = "Use `TableFunction::create_table_provider_with_args` instead" + )] pub fn create_table_provider(&self, args: &[Expr]) -> Result> { + #[expect(deprecated)] self.fun.call(args) } + + /// Get the function implementation and generate a table + pub fn create_table_provider_with_args( + &self, + args: TableFunctionArgs, + ) -> Result> { + self.fun.call_with_args(args) + } } diff --git a/datafusion/catalog/src/view.rs b/datafusion/catalog/src/view.rs index 89c6a4a224511..45084e65f23f2 100644 --- a/datafusion/catalog/src/view.rs +++ b/datafusion/catalog/src/view.rs @@ -17,15 +17,15 @@ //! View data source which uses a LogicalPlan as it's input. 
-use std::{any::Any, borrow::Cow, sync::Arc}; +use std::{borrow::Cow, sync::Arc}; use crate::Session; use crate::TableProvider; use arrow::datatypes::SchemaRef; use async_trait::async_trait; -use datafusion_common::error::Result; use datafusion_common::Column; +use datafusion_common::error::Result; use datafusion_expr::TableType; use datafusion_expr::{Expr, LogicalPlan}; use datafusion_expr::{LogicalPlanBuilder, TableProviderFilterPushDown}; @@ -83,10 +83,6 @@ impl ViewTable { #[async_trait] impl TableProvider for ViewTable { - fn as_any(&self) -> &dyn Any { - self - } - fn get_logical_plan(&'_ self) -> Option> { Some(Cow::Borrowed(&self.logical_plan)) } diff --git a/datafusion/common-runtime/Cargo.toml b/datafusion/common-runtime/Cargo.toml index e53d97b41360a..fd9a818bcb1d0 100644 --- a/datafusion/common-runtime/Cargo.toml +++ b/datafusion/common-runtime/Cargo.toml @@ -31,6 +31,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true diff --git a/datafusion/common-runtime/src/common.rs b/datafusion/common-runtime/src/common.rs index cebd6e04cd1b1..ca618b19ed2f1 100644 --- a/datafusion/common-runtime/src/common.rs +++ b/datafusion/common-runtime/src/common.rs @@ -44,7 +44,7 @@ impl SpawnedTask { R: Send, { // Ok to use spawn here as SpawnedTask handles aborting/cancelling the task on Drop - #[allow(clippy::disallowed_methods)] + #[expect(clippy::disallowed_methods)] let inner = tokio::task::spawn(trace_future(task)); Self { inner } } @@ -56,7 +56,7 @@ impl SpawnedTask { R: Send, { // Ok to use spawn_blocking here as SpawnedTask handles aborting/cancelling the task on Drop - #[allow(clippy::disallowed_methods)] + #[expect(clippy::disallowed_methods)] let inner = tokio::task::spawn_blocking(trace_block(task)); Self { inner } } @@ -115,14 +115,14 @@ impl Drop for SpawnedTask { mod tests { use super::*; - use std::future::{pending, Pending}; + use std::future::{Pending, pending}; use tokio::{runtime::Runtime, sync::oneshot}; #[tokio::test] async fn runtime_shutdown() { let rt = Runtime::new().unwrap(); - #[allow(clippy::async_yields_async)] + #[expect(clippy::async_yields_async)] let task = rt .spawn(async { SpawnedTask::spawn(async { diff --git a/datafusion/common-runtime/src/lib.rs b/datafusion/common-runtime/src/lib.rs index 5d404d99e7760..cf45ccf3ef63a 100644 --- a/datafusion/common-runtime/src/lib.rs +++ b/datafusion/common-runtime/src/lib.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))] #![doc( html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg", html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" @@ -31,5 +32,5 @@ mod trace_utils; pub use common::SpawnedTask; pub use join_set::JoinSet; pub use trace_utils::{ - set_join_set_tracer, trace_block, trace_future, JoinSetTracer, JoinSetTracerError, + JoinSetTracer, JoinSetTracerError, set_join_set_tracer, trace_block, trace_future, }; diff --git a/datafusion/common-runtime/src/trace_utils.rs b/datafusion/common-runtime/src/trace_utils.rs index c3a39c355fc88..f8adbe8825bc1 100644 --- a/datafusion/common-runtime/src/trace_utils.rs +++ b/datafusion/common-runtime/src/trace_utils.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -use futures::future::BoxFuture; use futures::FutureExt; +use futures::future::BoxFuture; use std::any::Any; use std::error::Error; use std::fmt::{Display, Formatter, Result as FmtResult}; diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index a9eb0f2220c69..740d4e45b8d05 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -31,6 +31,9 @@ rust-version = { workspace = true } [package.metadata.docs.rs] all-features = true +# Note: add additional linter rules in lib.rs. +# Rust does not support workspace + new linter rules in subcrates yet +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true @@ -38,49 +41,55 @@ workspace = true name = "datafusion_common" [features] -avro = ["apache-avro"] backtrace = [] parquet_encryption = [ "parquet", "parquet/encryption", "dep:hex", ] -pyarrow = ["pyo3", "arrow/pyarrow", "parquet"] force_hash_collisions = [] recursive_protection = ["dep:recursive"] parquet = ["dep:parquet"] sql = ["sqlparser"] +[[bench]] +harness = false +name = "with_hashes" + +[[bench]] +harness = false +name = "scalar_to_array" + +[[bench]] +harness = false +name = "stats_merge" + [dependencies] -ahash = { workspace = true } -apache-avro = { version = "0.20", default-features = false, features = [ - "bzip", - "snappy", - "xz", - "zstandard", -], optional = true } arrow = { workspace = true } arrow-ipc = { workspace = true } +arrow-schema = { workspace = true, features = ["canonical_extension_types"] } chrono = { workspace = true } +foldhash = "0.2" half = { workspace = true } hashbrown = { workspace = true } hex = { workspace = true, optional = true } indexmap = { workspace = true } -libc = "0.2.177" +itertools = { workspace = true } +libc = "0.2.185" log = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } -paste = "1.0.15" -pyo3 = { version = "0.26", optional = true } recursive = { workspace = true, optional = true } sqlparser = { workspace = true, optional = true } tokio = { workspace = true } +uuid = { workspace = true, features = ["v4"] } [target.'cfg(target_family = "wasm")'.dependencies] web-time = "1.1.0" [dev-dependencies] chrono = { workspace = true } +criterion = { workspace = true } insta = { workspace = true } rand = { workspace = true } sqlparser = { workspace = true } diff --git a/datafusion/common/benches/scalar_to_array.rs b/datafusion/common/benches/scalar_to_array.rs new file mode 100644 index 
0000000000000..90a152e515fe5 --- /dev/null +++ b/datafusion/common/benches/scalar_to_array.rs @@ -0,0 +1,107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmarks for `ScalarValue::to_array_of_size`, focusing on List +//! scalars. + +use arrow::array::{Array, ArrayRef, AsArray, StringViewBuilder}; +use arrow::datatypes::{DataType, Field}; +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use datafusion_common::ScalarValue; +use datafusion_common::utils::SingleRowListArrayBuilder; +use std::sync::Arc; + +/// Build a `ScalarValue::List` of `num_elements` Utf8View strings whose +/// inner StringViewArray has `num_buffers` data buffers. +fn make_list_scalar(num_elements: usize, num_buffers: usize) -> ScalarValue { + let elements_per_buffer = num_elements.div_ceil(num_buffers); + + let mut small_arrays: Vec = Vec::new(); + let mut remaining = num_elements; + for buf_idx in 0..num_buffers { + let count = remaining.min(elements_per_buffer); + if count == 0 { + break; + } + let start = buf_idx * elements_per_buffer; + let mut builder = StringViewBuilder::with_capacity(count); + for i in start..start + count { + builder.append_value(format!("{i:024x}")); + } + small_arrays.push(Arc::new(builder.finish()) as ArrayRef); + remaining -= count; + } + + let refs: Vec<&dyn Array> = small_arrays.iter().map(|a| a.as_ref()).collect(); + let concated = arrow::compute::concat(&refs).unwrap(); + + let list_array = SingleRowListArrayBuilder::new(concated) + .with_field(&Field::new_list_field(DataType::Utf8View, true)) + .build_list_array(); + ScalarValue::List(Arc::new(list_array)) +} + +/// We want to measure the cost of doing the conversion and then also accessing +/// the results, to model what would happen during query evaluation. 
+fn consume_list_array(arr: &ArrayRef) { + let list_arr = arr.as_list::(); + let mut total_len: usize = 0; + for i in 0..list_arr.len() { + let inner = list_arr.value(i); + let sv = inner.as_string_view(); + for j in 0..sv.len() { + total_len += sv.value(j).len(); + } + } + std::hint::black_box(total_len); +} + +fn bench_list_to_array_of_size(c: &mut Criterion) { + let mut group = c.benchmark_group("list_to_array_of_size"); + + let num_elements = 1245; + let scalar_1buf = make_list_scalar(num_elements, 1); + let scalar_50buf = make_list_scalar(num_elements, 50); + + for batch_size in [256, 1024] { + group.bench_with_input( + BenchmarkId::new("1_buffer", batch_size), + &batch_size, + |b, &sz| { + b.iter(|| { + let arr = scalar_1buf.to_array_of_size(sz).unwrap(); + consume_list_array(&arr); + }); + }, + ); + group.bench_with_input( + BenchmarkId::new("50_buffers", batch_size), + &batch_size, + |b, &sz| { + b.iter(|| { + let arr = scalar_50buf.to_array_of_size(sz).unwrap(); + consume_list_array(&arr); + }); + }, + ); + } + + group.finish(); +} + +criterion_group!(benches, bench_list_to_array_of_size); +criterion_main!(benches); diff --git a/datafusion/common/benches/stats_merge.rs b/datafusion/common/benches/stats_merge.rs new file mode 100644 index 0000000000000..73229b6379360 --- /dev/null +++ b/datafusion/common/benches/stats_merge.rs @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmark for `Statistics::try_merge_iter`. 
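+//!
+//! A minimal usage sketch of the API under benchmark (signature inferred from
+//! the benchmark body below, which passes a collection of `Statistics` and a
+//! `Schema` reference):
+//!
+//! ```ignore
+//! let merged = Statistics::try_merge_iter(&partition_stats, &schema)?;
+//! assert!(matches!(merged.num_rows, Precision::Exact(_)));
+//! ```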
+
+use std::sync::Arc;
+
+use arrow::datatypes::{DataType, Field, Schema};
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+use datafusion_common::stats::Precision;
+use datafusion_common::{ColumnStatistics, ScalarValue, Statistics};
+
+/// Build a vector of `n` `Statistics` values, each with `num_cols` columns
+fn make_stats(n: usize, num_cols: usize) -> Vec<Statistics> {
+    (0..n)
+        .map(|i| {
+            let mut stats = Statistics::default()
+                .with_num_rows(Precision::Exact(100 + i))
+                .with_total_byte_size(Precision::Exact(8000 + i * 80));
+            for c in 0..num_cols {
+                let base = (i * num_cols + c) as i64;
+                stats = stats.add_column_statistics(
+                    ColumnStatistics::new_unknown()
+                        .with_null_count(Precision::Exact(i))
+                        .with_min_value(Precision::Exact(ScalarValue::Int64(Some(base))))
+                        .with_max_value(Precision::Exact(ScalarValue::Int64(Some(
+                            base + 1000,
+                        ))))
+                        .with_sum_value(Precision::Exact(ScalarValue::Int64(Some(
+                            base * 100,
+                        )))),
+                );
+            }
+            stats
+        })
+        .collect()
+}
+
+fn bench_stats_merge(c: &mut Criterion) {
+    let mut group = c.benchmark_group("stats_merge");
+
+    for &num_partitions in &[10, 100, 500] {
+        for &num_cols in &[1, 5, 20] {
+            let items = make_stats(num_partitions, num_cols);
+            let schema = Arc::new(Schema::new(
+                (0..num_cols)
+                    .map(|i| Field::new(format!("col{i}"), DataType::Int64, true))
+                    .collect::<Vec<_>>(),
+            ));
+
+            let param = format!("{num_partitions}parts_{num_cols}cols");
+
+            group.bench_with_input(
+                BenchmarkId::new("try_merge_iter", &param),
+                &(&items, &schema),
+                |b, (items, schema)| {
+                    b.iter(|| {
+                        std::hint::black_box(
+                            Statistics::try_merge_iter(*items, schema).unwrap(),
+                        );
+                    });
+                },
+            );
+        }
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, bench_stats_merge);
+criterion_main!(benches);
diff --git a/datafusion/common/benches/with_hashes.rs b/datafusion/common/benches/with_hashes.rs
new file mode 100644
index 0000000000000..0e9c53c896a5e
--- /dev/null
+++ b/datafusion/common/benches/with_hashes.rs
@@ -0,0 +1,569 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Benchmarks for `with_hashes` function
+
+use arrow::array::{
+    Array, ArrayRef, ArrowPrimitiveType, DictionaryArray, GenericStringArray, Int32Array,
+    Int64Array, ListArray, MapArray, NullBufferBuilder, OffsetSizeTrait, PrimitiveArray,
+    RunArray, StringViewArray, StructArray, UnionArray, make_array,
+};
+use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
+use arrow::datatypes::{
+    ArrowDictionaryKeyType, DataType, Field, Fields, Int32Type, Int64Type, UnionFields,
+};
+use criterion::{Bencher, Criterion, criterion_group, criterion_main};
+use datafusion_common::hash_utils::RandomState;
+use datafusion_common::hash_utils::with_hashes;
+use rand::Rng;
+use rand::SeedableRng;
+use rand::distr::{Alphanumeric, Distribution, StandardUniform};
+use rand::prelude::StdRng;
+use std::sync::Arc;
+
+const BATCH_SIZE: usize = 8192;
+
+struct BenchData {
+    name: &'static str,
+    array: ArrayRef,
+    /// Union arrays can't have null bitmasks added
+    supports_nulls: bool,
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let pool = StringPool::new(100, 64);
+    // pool with small strings for string view tests (<=12 bytes are inlined)
+    let small_pool = StringPool::new(100, 5);
+    let cases = [
+        BenchData {
+            name: "int64",
+            array: primitive_array::<Int64Type>(BATCH_SIZE),
+            supports_nulls: true,
+        },
+        BenchData {
+            name: "utf8",
+            array: pool.string_array::<i32>(BATCH_SIZE),
+            supports_nulls: true,
+        },
+        BenchData {
+            name: "large_utf8",
+            array: pool.string_array::<i64>(BATCH_SIZE),
+            supports_nulls: true,
+        },
+        BenchData {
+            name: "utf8_view",
+            array: pool.string_view_array(BATCH_SIZE),
+            supports_nulls: true,
+        },
+        BenchData {
+            name: "utf8_view (small)",
+            array: small_pool.string_view_array(BATCH_SIZE),
+            supports_nulls: true,
+        },
+        BenchData {
+            name: "dictionary_utf8_int32",
+            array: pool.dictionary_array::<Int32Type>(BATCH_SIZE),
+            supports_nulls: true,
+        },
+        BenchData {
+            name: "list_array",
+            array: list_array(BATCH_SIZE),
+            supports_nulls: true,
+        },
+        BenchData {
+            name: "map_array",
+            array: map_array(BATCH_SIZE),
+            supports_nulls: true,
+        },
+        BenchData {
+            name: "sparse_union",
+            array: sparse_union_array(BATCH_SIZE),
+            supports_nulls: false,
+        },
+        BenchData {
+            name: "dense_union",
+            array: dense_union_array(BATCH_SIZE),
+            supports_nulls: false,
+        },
+        BenchData {
+            name: "struct_array",
+            array: create_struct_array(&pool, BATCH_SIZE),
+            supports_nulls: true,
+        },
+        BenchData {
+            name: "run_array_int32",
+            array: create_run_array::<Int32Type>(BATCH_SIZE),
+            supports_nulls: true,
+        },
+    ];
+
+    for BenchData {
+        name,
+        array,
+        supports_nulls,
+    } in cases
+    {
+        c.bench_function(&format!("{name}: single, no nulls"), |b| {
+            do_hash_test(b, std::slice::from_ref(&array));
+        });
+        c.bench_function(&format!("{name}: multiple, no nulls"), |b| {
+            let arrays = vec![array.clone(), array.clone(), array.clone()];
+            do_hash_test(b, &arrays);
+        });
+        // Union arrays can't have null bitmasks
+        if supports_nulls {
+            let nullable_array = add_nulls(&array);
+            c.bench_function(&format!("{name}: single, nulls"), |b| {
+                do_hash_test(b, std::slice::from_ref(&nullable_array));
+            });
+            c.bench_function(&format!("{name}: multiple, nulls"), |b| {
+                let arrays = vec![
+                    nullable_array.clone(),
+                    nullable_array.clone(),
+                    nullable_array.clone(),
+                ];
+                do_hash_test(b, &arrays);
+            });
+        }
+    }
+}
+
+fn do_hash_test(b: &mut Bencher, arrays: &[ArrayRef]) {
+    let state = RandomState::default();
+    b.iter(|| {
+        with_hashes(arrays, &state, |hashes| {
+            assert_eq!(hashes.len(), BATCH_SIZE); // make sure the result is used
+            Ok(())
+        })
+        .unwrap();
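+        // Callback-style API note (inferred from this usage): `with_hashes`
+        // computes one u64 hash per row across all input columns and lends the
+        // hash buffer to the closure, which lets the implementation reuse an
+        // internal buffer across calls instead of allocating a Vec per batch.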
+ }); +} + +fn create_null_mask(len: usize) -> NullBuffer +where + StandardUniform: Distribution, +{ + let mut rng = make_rng(); + let null_density = 0.03; + let mut builder = NullBufferBuilder::new(len); + for _ in 0..len { + if rng.random::() < null_density { + builder.append_null(); + } else { + builder.append_non_null(); + } + } + builder.finish().expect("should be nulls in buffer") +} + +// Returns a new array that is the same as array, but with nulls +// Handles the special case of RunArray where nulls must be in the values array +fn add_nulls(array: &ArrayRef) -> ArrayRef { + use arrow::datatypes::DataType; + + match array.data_type() { + DataType::RunEndEncoded(_, _) => { + // RunArray can't have top-level nulls, so apply nulls to the values array + let run_array = array + .as_any() + .downcast_ref::>() + .expect("Expected RunArray"); + + let run_ends_buffer = run_array.run_ends().inner().clone(); + let run_ends_array = PrimitiveArray::::new(run_ends_buffer, None); + let values = run_array.values().clone(); + + // Add nulls to the values array + let values_with_nulls = { + let array_data = values + .clone() + .into_data() + .into_builder() + .nulls(Some(create_null_mask(values.len()))) + .build() + .unwrap(); + make_array(array_data) + }; + + Arc::new( + RunArray::try_new(&run_ends_array, values_with_nulls.as_ref()) + .expect("Failed to create RunArray with null values"), + ) + } + _ => { + let array_data = array + .clone() + .into_data() + .into_builder() + .nulls(Some(create_null_mask(array.len()))) + .build() + .unwrap(); + make_array(array_data) + } + } +} + +pub fn make_rng() -> StdRng { + StdRng::seed_from_u64(42) +} + +/// String pool for generating low cardinality data (for dictionaries and string views) +struct StringPool { + strings: Vec, +} + +impl StringPool { + /// Create a new string pool with the given number of random strings + /// each having between 1 and max_length characters. 
fn new(pool_size: usize, max_length: usize) -> Self {
+        let mut rng = make_rng();
+        let mut strings = Vec::with_capacity(pool_size);
+        for _ in 0..pool_size {
+            let len = rng.random_range(1..=max_length);
+            let value: Vec<u8> =
+                rng.clone().sample_iter(&Alphanumeric).take(len).collect();
+            strings.push(String::from_utf8(value).unwrap());
+        }
+        Self { strings }
+    }
+
+    /// Return an iterator over &str of the given length with values randomly chosen from the pool
+    fn iter_strings(&self, len: usize) -> impl Iterator<Item = &str> {
+        let mut rng = make_rng();
+        (0..len).map(move |_| {
+            let idx = rng.random_range(0..self.strings.len());
+            self.strings[idx].as_str()
+        })
+    }
+
+    /// Return a StringArray of the given length with values randomly chosen from the pool
+    fn string_array<O: OffsetSizeTrait>(&self, array_length: usize) -> ArrayRef {
+        Arc::new(GenericStringArray::<O>::from_iter_values(
+            self.iter_strings(array_length),
+        ))
+    }
+
+    /// Return a StringViewArray of the given length with values randomly chosen from the pool
+    fn string_view_array(&self, array_length: usize) -> ArrayRef {
+        Arc::new(StringViewArray::from_iter_values(
+            self.iter_strings(array_length),
+        ))
+    }
+
+    /// Return a DictionaryArray of the given length with values randomly chosen from the pool
+    fn dictionary_array<K: ArrowDictionaryKeyType>(
+        &self,
+        array_length: usize,
+    ) -> ArrayRef {
+        Arc::new(DictionaryArray::<K>::from_iter(
+            self.iter_strings(array_length),
+        ))
+    }
+}
+
+pub fn primitive_array<T>(array_len: usize) -> ArrayRef
+where
+    T: ArrowPrimitiveType,
+    StandardUniform: Distribution<T::Native>,
+{
+    let mut rng = make_rng();
+
+    let array: PrimitiveArray<T> = (0..array_len)
+        .map(|_| Some(rng.random::<T::Native>()))
+        .collect();
+    Arc::new(array)
+}
+
+/// Benchmark sliced arrays to demonstrate the optimization for slices: when an
+/// array is sliced, the underlying buffer may be much larger than what the
+/// slice references. The optimization avoids hashing unreferenced elements.
+fn sliced_array_benchmark(c: &mut Criterion) { + // Test with different slice ratios: slice_size / total_size + // Smaller ratio = more potential savings from the optimization + let slice_ratios = [10, 5, 2]; // 1/10, 1/5, 1/2 of total + + for ratio in slice_ratios { + let total_rows = BATCH_SIZE * ratio; + let slice_offset = BATCH_SIZE * (ratio / 2); // Take from middle + let slice_len = BATCH_SIZE; + + // Sliced ListArray + { + let full_array = list_array(total_rows); + let sliced: ArrayRef = Arc::new( + full_array + .as_any() + .downcast_ref::() + .unwrap() + .slice(slice_offset, slice_len), + ); + c.bench_function( + &format!("list_array_sliced: 1/{ratio} of {total_rows} rows"), + |b| { + do_hash_test_with_len(b, std::slice::from_ref(&sliced), slice_len); + }, + ); + } + + // Sliced MapArray + { + let full_array = map_array(total_rows); + let sliced: ArrayRef = Arc::new( + full_array + .as_any() + .downcast_ref::() + .unwrap() + .slice(slice_offset, slice_len), + ); + c.bench_function( + &format!("map_array_sliced: 1/{ratio} of {total_rows} rows"), + |b| { + do_hash_test_with_len(b, std::slice::from_ref(&sliced), slice_len); + }, + ); + } + + // Sliced Sparse UnionArray + { + let full_array = sparse_union_array(total_rows); + let sliced: ArrayRef = Arc::new( + full_array + .as_any() + .downcast_ref::() + .unwrap() + .slice(slice_offset, slice_len), + ); + c.bench_function( + &format!("sparse_union_sliced: 1/{ratio} of {total_rows} rows"), + |b| { + do_hash_test_with_len(b, std::slice::from_ref(&sliced), slice_len); + }, + ); + } + } +} + +fn do_hash_test_with_len(b: &mut Bencher, arrays: &[ArrayRef], expected_len: usize) { + let state = RandomState::default(); + b.iter(|| { + with_hashes(arrays, &state, |hashes| { + assert_eq!(hashes.len(), expected_len); + Ok(()) + }) + .unwrap(); + }); +} + +fn list_array(num_rows: usize) -> ArrayRef { + let mut rng = make_rng(); + let elements_per_row = 5; + let total_elements = num_rows * elements_per_row; + + let values: Int64Array = (0..total_elements) + .map(|_| Some(rng.random::())) + .collect(); + let offsets: Vec = (0..=num_rows) + .map(|i| (i * elements_per_row) as i32) + .collect(); + + Arc::new(ListArray::new( + Arc::new(Field::new("item", DataType::Int64, true)), + OffsetBuffer::new(ScalarBuffer::from(offsets)), + Arc::new(values), + None, + )) +} + +fn map_array(num_rows: usize) -> ArrayRef { + let mut rng = make_rng(); + let entries_per_row = 5; + let total_entries = num_rows * entries_per_row; + + let keys: Int32Array = (0..total_entries) + .map(|_| Some(rng.random::())) + .collect(); + let values: Int64Array = (0..total_entries) + .map(|_| Some(rng.random::())) + .collect(); + let offsets: Vec = (0..=num_rows) + .map(|i| (i * entries_per_row) as i32) + .collect(); + + let entries = StructArray::try_new( + Fields::from(vec![ + Field::new("keys", DataType::Int32, false), + Field::new("values", DataType::Int64, true), + ]), + vec![Arc::new(keys), Arc::new(values)], + None, + ) + .unwrap(); + + Arc::new(MapArray::new( + Arc::new(Field::new( + "entries", + DataType::Struct(Fields::from(vec![ + Field::new("keys", DataType::Int32, false), + Field::new("values", DataType::Int64, true), + ])), + false, + )), + OffsetBuffer::new(ScalarBuffer::from(offsets)), + entries, + None, + false, + )) +} + +fn sparse_union_array(num_rows: usize) -> ArrayRef { + let mut rng = make_rng(); + let num_types = 5; + + let type_ids: Vec = (0..num_rows) + .map(|_| rng.random_range(0..num_types) as i8) + .collect(); + let (fields, children): (Vec<_>, Vec<_>) = 
(0..num_types) + .map(|i| { + ( + ( + i as i8, + Arc::new(Field::new(format!("f{i}"), DataType::Int64, true)), + ), + primitive_array::(num_rows), + ) + }) + .unzip(); + + Arc::new( + UnionArray::try_new( + UnionFields::from_iter(fields), + ScalarBuffer::from(type_ids), + None, + children, + ) + .unwrap(), + ) +} + +fn dense_union_array(num_rows: usize) -> ArrayRef { + let mut rng = make_rng(); + let num_types = 5; + let type_ids: Vec = (0..num_rows) + .map(|_| rng.random_range(0..num_types) as i8) + .collect(); + + let mut type_counts = vec![0i32; num_types]; + for &tid in &type_ids { + type_counts[tid as usize] += 1; + } + + let mut current_offsets = vec![0i32; num_types]; + let offsets: Vec = type_ids + .iter() + .map(|&tid| { + let offset = current_offsets[tid as usize]; + current_offsets[tid as usize] += 1; + offset + }) + .collect(); + + let (fields, children): (Vec<_>, Vec<_>) = (0..num_types) + .map(|i| { + ( + ( + i as i8, + Arc::new(Field::new(format!("f{i}"), DataType::Int64, true)), + ), + primitive_array::(type_counts[i] as usize), + ) + }) + .unzip(); + + Arc::new( + UnionArray::try_new( + UnionFields::from_iter(fields), + ScalarBuffer::from(type_ids), + Some(ScalarBuffer::from(offsets)), + children, + ) + .unwrap(), + ) +} + +fn boolean_array(array_len: usize) -> ArrayRef { + let mut rng = make_rng(); + Arc::new( + (0..array_len) + .map(|_| Some(rng.random::())) + .collect::(), + ) +} + +/// Create a StructArray with multiple columns +fn create_struct_array(pool: &StringPool, array_len: usize) -> ArrayRef { + let bool_array = boolean_array(array_len); + let int32_array = primitive_array::(array_len); + let int64_array = primitive_array::(array_len); + let str_array = pool.string_array::(array_len); + + let fields = Fields::from(vec![ + Field::new("bool_col", DataType::Boolean, false), + Field::new("int32_col", DataType::Int32, false), + Field::new("int64_col", DataType::Int64, false), + Field::new("string_col", DataType::Utf8, false), + ]); + + Arc::new(StructArray::new( + fields, + vec![bool_array, int32_array, int64_array, str_array], + None, + )) +} + +/// Create a RunArray to test run array hashing. +fn create_run_array(array_len: usize) -> ArrayRef +where + T: ArrowPrimitiveType, + StandardUniform: Distribution, +{ + let mut rng = make_rng(); + + // Create runs of varying lengths + let mut run_ends = Vec::new(); + let mut values = Vec::new(); + let mut current_end = 0; + + while current_end < array_len { + // Random run length between 1 and 50 + let run_length = rng.random_range(1..=50).min(array_len - current_end); + current_end += run_length; + run_ends.push(current_end as i32); + values.push(Some(rng.random::())); + } + + let run_ends_array = Arc::new(PrimitiveArray::::from(run_ends)); + let values_array: Arc = + Arc::new(values.into_iter().collect::>()); + + Arc::new( + RunArray::try_new(&run_ends_array, values_array.as_ref()) + .expect("Failed to create RunArray"), + ) +} + +criterion_group!(benches, criterion_benchmark, sliced_array_benchmark); +criterion_main!(benches); diff --git a/datafusion/common/src/alias.rs b/datafusion/common/src/alias.rs index 2ee2cb4dc7add..99f6447a6acd8 100644 --- a/datafusion/common/src/alias.rs +++ b/datafusion/common/src/alias.rs @@ -37,6 +37,16 @@ impl AliasGenerator { Self::default() } + /// Advance the counter to at least `min_id`, ensuring future aliases + /// won't collide with already-existing ones. 
+ /// + /// For example, if the query already contains an alias `alias_42`, then calling + /// `update_min_id(42)` will ensure that future aliases generated by this + /// [`AliasGenerator`] will start from `alias_43`. + pub fn update_min_id(&self, min_id: usize) { + self.next_id.fetch_max(min_id + 1, Ordering::Relaxed); + } + /// Return a unique alias with the provided prefix pub fn next(&self, prefix: &str) -> String { let id = self.next_id.fetch_add(1, Ordering::Relaxed); diff --git a/datafusion/common/src/cast.rs b/datafusion/common/src/cast.rs index e6eda3c585e89..bc4313ed95665 100644 --- a/datafusion/common/src/cast.rs +++ b/datafusion/common/src/cast.rs @@ -20,13 +20,14 @@ //! but provide an error message rather than a panic, as the corresponding //! kernels in arrow-rs such as `as_boolean_array` do. -use crate::{downcast_value, Result}; +use crate::{Result, downcast_value}; use arrow::array::{ BinaryViewArray, Decimal32Array, Decimal64Array, DurationMicrosecondArray, DurationMillisecondArray, DurationNanosecondArray, DurationSecondArray, Float16Array, - Int16Array, Int8Array, LargeBinaryArray, LargeStringArray, StringViewArray, - UInt16Array, + Int8Array, Int16Array, LargeBinaryArray, LargeListViewArray, LargeStringArray, + ListViewArray, RunArray, StringViewArray, UInt16Array, }; +use arrow::datatypes::RunEndIndexType; use arrow::{ array::{ Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, @@ -37,8 +38,8 @@ use arrow::{ MapArray, NullArray, OffsetSizeTrait, PrimitiveArray, StringArray, StructArray, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray, TimestampMillisecondArray, - TimestampNanosecondArray, TimestampSecondArray, UInt32Array, UInt64Array, - UInt8Array, UnionArray, + TimestampNanosecondArray, TimestampSecondArray, UInt8Array, UInt32Array, + UInt64Array, UnionArray, }, datatypes::{ArrowDictionaryKeyType, ArrowPrimitiveType}, }; @@ -324,3 +325,18 @@ pub fn as_generic_string_array( ) -> Result<&GenericStringArray> { Ok(downcast_value!(array, GenericStringArray, T)) } + +// Downcast Array to ListViewArray +pub fn as_list_view_array(array: &dyn Array) -> Result<&ListViewArray> { + Ok(downcast_value!(array, ListViewArray)) +} + +// Downcast Array to LargeListViewArray +pub fn as_large_list_view_array(array: &dyn Array) -> Result<&LargeListViewArray> { + Ok(downcast_value!(array, LargeListViewArray)) +} + +// Downcast Array to RunArray +pub fn as_run_array(array: &dyn Array) -> Result<&RunArray> { + Ok(downcast_value!(array, RunArray, T)) +} diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index f4afdf7002078..2889259dd4820 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -22,8 +22,9 @@ use arrow_ipc::CompressionType; #[cfg(feature = "parquet_encryption")] use crate::encryption::{FileDecryptionProperties, FileEncryptionProperties}; use crate::error::_config_err; -use crate::format::{ExplainAnalyzeLevel, ExplainFormat}; -use crate::parsers::CompressionTypeVariant; +use crate::format::{ExplainAnalyzeCategories, ExplainFormat, MetricType}; +use crate::parquet_config::DFParquetWriterVersion; +use crate::parsers::{CompressionTypeVariant, CsvQuoteStyle}; use crate::utils::get_available_parallelism; use crate::{DataFusionError, Result}; #[cfg(feature = "parquet_encryption")] @@ -157,12 +158,10 @@ macro_rules! config_namespace { // $(#[allow(deprecated)])? { $(let value = $transform(value);)? 
// Apply transformation if specified - #[allow(deprecated)] let ret = self.$field_name.set(rem, value.as_ref()); $(if !$warn.is_empty() { let default: $field_type = $default; - #[allow(deprecated)] if default != self.$field_name { log::warn!($warn); } @@ -181,14 +180,36 @@ macro_rules! config_namespace { $( let key = format!(concat!("{}.", stringify!($field_name)), key_prefix); let desc = concat!($($d),*).trim(); - #[allow(deprecated)] self.$field_name.visit(v, key.as_str(), desc); )* } + + fn reset(&mut self, key: &str) -> $crate::error::Result<()> { + let (key, rem) = key.split_once('.').unwrap_or((key, "")); + match key { + $( + stringify!($field_name) => { + { + if rem.is_empty() { + let default_value: $field_type = $default; + self.$field_name = default_value; + Ok(()) + } else { + self.$field_name.reset(rem) + } + } + }, + )* + _ => $crate::error::_config_err!( + "Config value \"{}\" not found on {}", + key, + stringify!($struct_name) + ), + } + } } impl Default for $struct_name { fn default() -> Self { - #[allow(deprecated)] Self { $($field_name: $default),* } @@ -290,6 +311,15 @@ config_namespace! { /// By default, `nulls_max` is used to follow Postgres's behavior. /// postgres rule: pub default_null_ordering: String, default = "nulls_max".to_string() + + /// When set to true, DataFusion may remove `ORDER BY` clauses from + /// subqueries or CTEs during SQL planning when their ordering cannot + /// affect the result, such as when no `LIMIT` or other + /// order-sensitive operator depends on them. + /// + /// Disable this option to preserve explicit subquery ordering in the + /// planned query. + pub enable_subquery_sort_elimination: bool, default = true } } @@ -448,6 +478,25 @@ config_namespace! { /// metadata memory consumption pub batch_size: usize, default = 8192 + /// A perfect hash join (see `HashJoinExec` for more details) will be considered + /// if the range of keys (max - min) on the build side is < this threshold. + /// This provides a fast path for joins with very small key ranges, + /// bypassing the density check. + /// + /// Currently only supports cases where build_side.num_rows() < u32::MAX. + /// Support for build_side.num_rows() >= u32::MAX will be added in the future. + pub perfect_hash_join_small_build_threshold: usize, default = 1024 + + /// The minimum required density of join keys on the build side to consider a + /// perfect hash join (see `HashJoinExec` for more details). Density is calculated as: + /// `(number of rows) / (max_key - min_key + 1)`. + /// A perfect hash join may be used if the actual key density > this + /// value. + /// + /// Currently only supports cases where build_side.num_rows() < u32::MAX. + /// Support for build_side.num_rows() >= u32::MAX will be added in the future. + pub perfect_hash_join_min_key_density: f64, default = 0.15 + /// When set to true, record batches will be examined between each operator and /// small batches will be coalesced into larger batches. This is helpful when there /// are highly selective filters or joins that could produce tiny output batches. The @@ -517,6 +566,36 @@ config_namespace! { /// batches and merged. pub sort_in_place_threshold_bytes: usize, default = 1024 * 1024 + /// Maximum buffer capacity (in bytes) per partition for BufferExec + /// inserted during sort pushdown optimization. + /// + /// When PushdownSort eliminates a SortExec under SortPreservingMergeExec, + /// a BufferExec is inserted to replace SortExec's buffering role. 
     /// When set to true, record batches will be examined between each operator and
     /// small batches will be coalesced into larger batches. This is helpful when there
     /// are highly selective filters or joins that could produce tiny output batches. The
@@ -517,6 +566,36 @@ config_namespace! {
     /// batches and merged.
     pub sort_in_place_threshold_bytes: usize, default = 1024 * 1024

+    /// Maximum buffer capacity (in bytes) per partition for BufferExec
+    /// inserted during sort pushdown optimization.
+    ///
+    /// When PushdownSort eliminates a SortExec under SortPreservingMergeExec,
+    /// a BufferExec is inserted to replace SortExec's buffering role. This
+    /// prevents I/O stalls by allowing the scan to run ahead of the merge.
+    ///
+    /// This uses strictly less memory than the SortExec it replaces (which
+    /// buffers the entire partition). The buffer respects the global memory
+    /// pool limit. Setting this to a large value is safe; actual memory
+    /// usage is bounded by partition size and global memory limits.
+    pub sort_pushdown_buffer_capacity: usize, default = 1024 * 1024 * 1024
+
+    /// Maximum size in bytes for individual spill files before rotating to a new file.
+    ///
+    /// When operators spill data to disk (e.g., RepartitionExec), they write
+    /// multiple batches to the same file until this size limit is reached, then rotate
+    /// to a new file. This reduces syscall overhead compared to one-file-per-batch
+    /// while preventing files from growing too large.
+    ///
+    /// A larger value reduces file creation overhead but may hold on to disk space longer.
+    /// A smaller value creates more files but allows finer-grained space reclamation,
+    /// as files can be deleted once fully consumed.
+    ///
+    /// Currently only `RepartitionExec` supports this spill file rotation feature; other
+    /// spilling operators may create spill files larger than the limit.
+    ///
+    /// Default: 128 MB
+    pub max_spill_file_size_bytes: usize, default = 128 * 1024 * 1024
+
     /// Number of files to read in parallel when inferring schema and statistics
     pub meta_fetch_concurrency: usize, default = 32

@@ -589,6 +668,175 @@ config_namespace! {
     /// written, it may be necessary to increase this size to avoid errors from
     /// the remote end point.
     pub objectstore_writer_buffer_size: usize, default = 10 * 1024 * 1024
+
+    /// Whether to enable ANSI SQL mode.
+    ///
+    /// The flag is experimental and relevant only for DataFusion Spark built-in functions.
+    ///
+    /// When `enable_ansi_mode` is set to `true`, the query engine follows ANSI SQL
+    /// semantics for expressions, casting, and error handling. This means:
+    /// - **Strict type coercion rules:** implicit casts between incompatible types are disallowed.
+    /// - **Standard SQL arithmetic behavior:** operations such as division by zero,
+    ///   numeric overflow, or invalid casts raise runtime errors rather than returning
+    ///   `NULL` or adjusted values.
+    /// - **Consistent ANSI behavior** for string concatenation, comparisons, and `NULL` handling.
+    ///
+    /// When `enable_ansi_mode` is `false` (the default), the engine uses a more permissive,
+    /// non-ANSI mode designed for user convenience and backward compatibility. In this mode:
+    /// - Implicit casts between types are allowed (e.g., string to integer when possible).
+    /// - Arithmetic operations are more lenient; for example, `abs()` on the minimum
+    ///   representable integer value returns the input value instead of raising an
+    ///   overflow error.
+    /// - Division by zero or invalid casts may return `NULL` instead of failing.
+    ///
+    /// # Default
+    /// `false`: ANSI SQL mode is disabled by default.
+    pub enable_ansi_mode: bool, default = false
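Both new execution options are reachable through the existing string-based config API. A minimal sketch, assuming the `datafusion_common` crate; the key names come from the diff above and the values are illustrative:

```rust
use datafusion_common::config::ConfigOptions;
use datafusion_common::Result;

fn main() -> Result<()> {
    let mut config = ConfigOptions::new();

    // Rotate spill files every 256 MB instead of the 128 MB default.
    config.set(
        "datafusion.execution.max_spill_file_size_bytes",
        "268435456",
    )?;

    // Opt in to the experimental ANSI mode for Spark-compatible functions.
    config.set("datafusion.execution.enable_ansi_mode", "true")?;

    assert_eq!(config.execution.max_spill_file_size_bytes, 256 * 1024 * 1024);
    assert!(config.execution.enable_ansi_mode);
    Ok(())
}
```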
+    /// How many bytes to buffer in the probe side of hash joins while the build side is
+    /// concurrently being built.
+    ///
+    /// Without this, hash joins will wait until the full materialization of the build side
+    /// before polling the probe side. This is useful in scenarios where the query is not
+    /// completely CPU bounded, allowing some early work to happen concurrently and reducing the
+    /// latency of the query.
+    ///
+    /// Note that when hash join buffering is enabled, the probe side will start eagerly
+    /// polling data, not giving time for the producer side of dynamic filters to produce any
+    /// meaningful predicate. Queries with dynamic filters might see performance degradation.
+    ///
+    /// Disabled by default; set to a number greater than 0 to enable it.
+    pub hash_join_buffering_capacity: usize, default = 0
+    }
 }

+/// Options for content-defined chunking (CDC) when writing parquet files.
+/// See [`ParquetOptions::use_content_defined_chunking`].
+///
+/// Can be enabled with default options by setting
+/// `use_content_defined_chunking` to `true`, or configured with sub-fields
+/// like `use_content_defined_chunking.min_chunk_size`.
+#[derive(Debug, Clone, PartialEq)]
+pub struct CdcOptions {
+    /// Minimum chunk size in bytes. The rolling hash will not trigger a split
+    /// until this many bytes have been accumulated. Default is 256 KiB.
+    pub min_chunk_size: usize,
+
+    /// Maximum chunk size in bytes. A split is forced when the accumulated
+    /// size exceeds this value. Default is 1 MiB.
+    pub max_chunk_size: usize,
+
+    /// Normalization level. Increasing this improves deduplication ratio
+    /// but increases fragmentation. Recommended range is [-3, 3], default is 0.
+    pub norm_level: i32,
+}
+
+// Note: `CdcOptions` intentionally does NOT implement `Default` so that the
+// blanket `impl<F: ConfigField + Default> ConfigField for Option<F>` does not
+// apply. This allows the specific `impl ConfigField for Option<CdcOptions>`
+// below to handle "true"/"false" for enabling/disabling CDC.
+// Use `CdcOptions::default()` (the inherent method) instead of `Default::default()`.
+impl CdcOptions {
+    /// Returns a new `CdcOptions` with default values.
+    #[expect(clippy::should_implement_trait)]
+    pub fn default() -> Self {
+        Self {
+            min_chunk_size: 256 * 1024,
+            max_chunk_size: 1024 * 1024,
+            norm_level: 0,
+        }
+    }
+}
+
+impl ConfigField for CdcOptions {
+    fn set(&mut self, key: &str, value: &str) -> Result<()> {
+        let (key, rem) = key.split_once('.').unwrap_or((key, ""));
+        match key {
+            "min_chunk_size" => self.min_chunk_size.set(rem, value),
+            "max_chunk_size" => self.max_chunk_size.set(rem, value),
+            "norm_level" => self.norm_level.set(rem, value),
+            _ => _config_err!("Config value \"{}\" not found on CdcOptions", key),
+        }
+    }
+
+    fn visit<V: Visit>(&self, v: &mut V, key_prefix: &str, _description: &'static str) {
+        let key = format!("{key_prefix}.min_chunk_size");
+        self.min_chunk_size.visit(v, &key, "Minimum chunk size in bytes. The rolling hash will not trigger a split until this many bytes have been accumulated. Default is 256 KiB.");
+        let key = format!("{key_prefix}.max_chunk_size");
+        self.max_chunk_size.visit(v, &key, "Maximum chunk size in bytes. A split is forced when the accumulated size exceeds this value. Default is 1 MiB.");
+        let key = format!("{key_prefix}.norm_level");
+        self.norm_level.visit(v, &key, "Normalization level. Increasing this improves deduplication ratio but increases fragmentation. Recommended range is [-3, 3], default is 0.");
+    }
+
+    fn reset(&mut self, key: &str) -> Result<()> {
+        let (key, rem) = key.split_once('.').unwrap_or((key, ""));
+        match key {
+            "min_chunk_size" => {
+                if rem.is_empty() {
+                    self.min_chunk_size = CdcOptions::default().min_chunk_size;
+                    Ok(())
+                } else {
+                    self.min_chunk_size.reset(rem)
+                }
+            }
+            "max_chunk_size" => {
+                if rem.is_empty() {
+                    self.max_chunk_size = CdcOptions::default().max_chunk_size;
+                    Ok(())
+                } else {
+                    self.max_chunk_size.reset(rem)
+                }
+            }
+            "norm_level" => {
+                if rem.is_empty() {
+                    self.norm_level = CdcOptions::default().norm_level;
+                    Ok(())
+                } else {
+                    self.norm_level.reset(rem)
+                }
+            }
+            _ => _config_err!("Config value \"{}\" not found on CdcOptions", key),
+        }
+    }
+}
+
+/// `ConfigField` for `Option<CdcOptions>`: allows setting the option to
+/// `"true"` (enable with defaults) or `"false"` (disable), in addition to
+/// setting individual sub-fields like `min_chunk_size`.
+impl ConfigField for Option<CdcOptions> {
+    fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
+        match self {
+            Some(s) => s.visit(v, key, description),
+            None => v.none(key, description),
+        }
+    }
+
+    fn set(&mut self, key: &str, value: &str) -> Result<()> {
+        if key.is_empty() {
+            match value.to_ascii_lowercase().as_str() {
+                "true" => {
+                    *self = Some(CdcOptions::default());
+                    Ok(())
+                }
+                "false" => {
+                    *self = None;
+                    Ok(())
+                }
+                _ => _config_err!(
+                    "Expected 'true' or 'false' for use_content_defined_chunking, got '{value}'"
+                ),
+            }
+        } else {
+            self.get_or_insert_with(CdcOptions::default).set(key, value)
+        }
+    }
+
+    fn reset(&mut self, key: &str) -> Result<()> {
+        if key.is_empty() {
+            *self = None;
+            Ok(())
+        } else {
+            self.get_or_insert_with(CdcOptions::default).reset(key)
+        }
+    }
+}
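Putting the pieces together: the field accepts `true`/`false` as well as dotted sub-keys, and the new `reset` path returns it to `None`. A sketch under those assumptions (the `ConfigField` trait must be in scope for `reset`; requires the "parquet" feature):

```rust
use datafusion_common::config::{ConfigField, ConfigOptions};
use datafusion_common::Result;

fn main() -> Result<()> {
    let mut config = ConfigOptions::new();
    let key = "datafusion.execution.parquet.use_content_defined_chunking";

    // "true" enables CDC with CdcOptions::default()
    config.set(key, "true")?;

    // Assigning a sub-field also enables CDC implicitly
    config.set(&format!("{key}.norm_level"), "2")?;
    let cdc = config
        .execution
        .parquet
        .use_content_defined_chunking
        .as_ref()
        .unwrap();
    assert_eq!(cdc.norm_level, 2);

    // `reset` with the bare key returns the option to its default (None)
    config.reset(key)?;
    assert!(config.execution.parquet.use_content_defined_chunking.is_none());
    Ok(())
}
```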
@@ -634,6 +882,12 @@ config_namespace! {
     /// the filters are applied in the same order as written in the query
     pub reorder_filters: bool, default = false

+    /// (reading) Force the use of RowSelections for filter results, when
+    /// pushdown_filters is enabled. If false, the reader will automatically
+    /// choose between a RowSelection and a Bitmap based on the number and
+    /// pattern of selected rows.
+    pub force_filter_selections: bool, default = false
+
     /// (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`,
     /// and `Binary/BinaryLarge` with `BinaryView`.
     pub schema_force_view_types: bool, default = true
@@ -671,12 +925,12 @@ config_namespace! {
     /// (writing) Sets best effort maximum size of data page in bytes
     pub data_pagesize_limit: usize, default = 1024 * 1024

-    /// (writing) Sets write_batch_size in bytes
+    /// (writing) Sets write_batch_size in rows
     pub write_batch_size: usize, default = 1024

     /// (writing) Sets parquet writer version
     /// valid values are "1.0" and "2.0"
-    pub writer_version: String, default = "1.0".to_string()
+    pub writer_version: DFParquetWriterVersion, default = DFParquetWriterVersion::default()

     /// (writing) Skip encoding the embedded arrow metadata in the KV_meta
     ///
@@ -686,7 +940,7 @@ config_namespace! {

     /// (writing) Sets default parquet compression codec.
     /// Valid values are: uncompressed, snappy, gzip(level),
-    /// lzo, brotli(level), lz4, zstd(level), and lz4_raw.
+    /// brotli(level), lz4, zstd(level), and lz4_raw.
     /// These values are not case sensitive. If NULL, uses
     /// default parquet writer setting
     ///
@@ -771,6 +1025,12 @@ config_namespace! {
     /// writing out already in-memory data, such as from a cached
     /// data frame.
     pub maximum_buffered_record_batches_per_stream: usize, default = 2
+
+    /// (writing) EXPERIMENTAL: Enable content-defined chunking (CDC) when writing
+    /// parquet files. When `Some`, CDC is enabled with the given options; when `None`
+    /// (the default), CDC is disabled. When CDC is enabled, parallel writing is
+    /// automatically disabled since the chunker state must persist across row groups.
+    pub use_content_defined_chunking: Option<CdcOptions>, default = None
     }
 }

@@ -836,6 +1096,20 @@ config_namespace! {
     /// past window functions, if possible
     pub enable_window_limits: bool, default = true

+    /// When set to true, the optimizer will replace
+    /// Filter(rn<=K) → Window(ROW_NUMBER) → Sort patterns with a
+    /// PartitionedTopKExec that maintains per-partition heaps, avoiding
+    /// a full sort of the input.
+    /// When the window partition key has low cardinality, enabling this optimization
+    /// can improve performance. However, for high cardinality keys, it may
+    /// cause regressions in both memory usage and runtime.
+    pub enable_window_topn: bool, default = false
+
+    /// When set to true, the optimizer will push TopK (Sort with fetch)
+    /// below hash repartition when the partition key is a prefix of the
+    /// sort key, reducing data volume before the shuffle.
+    pub enable_topk_repartition: bool, default = true
+
     /// When set to true, the optimizer will attempt to push down TopK dynamic filters
     /// into the file scan phase.
     pub enable_topk_dynamic_filter_pushdown: bool, default = true
@@ -844,12 +1118,16 @@ config_namespace! {
     /// into the file scan phase.
     pub enable_join_dynamic_filter_pushdown: bool, default = true

-    /// When set to true attempts to push down dynamic filters generated by operators (topk & join) into the file scan phase.
+    /// When set to true, the optimizer will attempt to push down Aggregate dynamic filters
+    /// into the file scan phase.
+    pub enable_aggregate_dynamic_filter_pushdown: bool, default = true
+
+    /// When set to true attempts to push down dynamic filters generated by operators (TopK, Join & Aggregate) into the file scan phase.
     /// For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer
     /// will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans.
     /// This means that if we already have 10 timestamps in the year 2025
     /// any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan.
-    /// The config will suppress `enable_join_dynamic_filter_pushdown` & `enable_topk_dynamic_filter_pushdown`
+    /// The config will suppress `enable_join_dynamic_filter_pushdown`, `enable_topk_dynamic_filter_pushdown` & `enable_aggregate_dynamic_filter_pushdown`
     /// So if you disable `enable_topk_dynamic_filter_pushdown`, then enable `enable_dynamic_filter_pushdown`, the `enable_topk_dynamic_filter_pushdown` will be overridden.
     pub enable_dynamic_filter_pushdown: bool, default = true

@@ -895,6 +1173,19 @@ config_namespace! {
     /// record tables provided to the MemTable on creation.
     pub repartition_file_scans: bool, default = true

+    /// Minimum number of distinct partition values required to group files by their
+    /// Hive partition column values (enabling Hash partitioning declaration).
+    ///
+    /// How the option is used:
+    /// - preserve_file_partitions=0: Disable it.
+    /// - preserve_file_partitions=1: Always enable it.
+    /// - preserve_file_partitions=N, actual file partitions=M: Only enable when M >= N.
+    /// This threshold preserves I/O parallelism when file partitioning is below it.
+    ///
+    /// Note: This may reduce parallelism, stemming from the I/O level, if the number of distinct
+    /// partitions is less than the target_partitions.
+    pub preserve_file_partitions: usize, default = 0
+
     /// Should DataFusion repartition data using the partitions keys to execute window
     /// functions in parallel using the provided `target_partitions` level
     pub repartition_windows: bool, default = true
@@ -917,6 +1208,34 @@ config_namespace! {
     /// ```
     pub repartition_sorts: bool, default = true

+    /// Partition count threshold for subset satisfaction optimization.
+    ///
+    /// When the current partition count is >= this threshold, DataFusion will
+    /// skip repartitioning if the required partitioning expression is a subset
+    /// of the current partition expression such as Hash(a) satisfies Hash(a, b).
+    ///
+    /// When the current partition count is < this threshold, DataFusion will
+    /// repartition to increase parallelism even when subset satisfaction applies.
+    ///
+    /// Set to 0 to always repartition (disable subset satisfaction optimization).
+    /// Set to a high value to always use subset satisfaction.
+    ///
+    /// Example (subset_repartition_threshold = 4):
+    /// ```text
+    /// Hash([a]) satisfies Hash([a, b]) because [a] is a subset of [a, b]
+    ///
+    /// If current partitions (3) < threshold (4), repartition:
+    ///   AggregateExec: mode=FinalPartitioned, gby=[a, b], aggr=[SUM(x)]
+    ///     RepartitionExec: partitioning=Hash([a, b], 8), input_partitions=3
+    ///       AggregateExec: mode=Partial, gby=[a, b], aggr=[SUM(x)]
+    ///         DataSourceExec: file_groups={...}, output_partitioning=Hash([a], 3)
+    ///
+    /// If current partitions (8) >= threshold (4), use subset satisfaction:
+    ///   AggregateExec: mode=SinglePartitioned, gby=[a, b], aggr=[SUM(x)]
+    ///     DataSourceExec: file_groups={...}, output_partitioning=Hash([a], 8)
+    /// ```
+    pub subset_repartition_threshold: usize, default = 4
+
     /// When true, DataFusion will opportunistically remove sorts when the data is already sorted,
     /// (i.e. setting `preserve_order` to true on `RepartitionExec` and
     /// using `SortPreservingMergeExec`)
@@ -937,6 +1256,18 @@ config_namespace! {
     /// process to reorder the join keys
     pub top_down_join_key_reordering: bool, default = true

+    /// When set to true, the physical plan optimizer may swap join inputs
+    /// based on statistics. When set to false, statistics-driven join
+    /// input reordering is disabled and the original join order in the
+    /// query is used.
+    pub join_reordering: bool, default = true
+
+    /// When set to true, the physical plan optimizer uses the pluggable
+    /// `StatisticsRegistry` for statistics propagation across operators.
+    /// This enables more accurate cardinality estimates compared to each
+    /// operator's built-in `partition_statistics`.
+    pub use_statistics_registry: bool, default = false
+
     /// When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin.
     /// HashJoin can work more efficiently than SortMergeJoin but consumes more memory
     pub prefer_hash_join: bool, default = true
@@ -954,6 +1285,36 @@ config_namespace! {
     /// will be collected into a single partition
     pub hash_join_single_partition_threshold_rows: usize, default = 1024 * 128

+    /// Maximum size in bytes for the build side of a hash join to be pushed down as an InList expression for dynamic filtering.
+    /// Build sides larger than this will use hash table lookups instead.
+    /// Set to 0 to always use hash table lookups.
+    ///
+    /// InList pushdown can be more efficient for small build sides because it can result in better
+    /// statistics pruning as well as use any bloom filters present on the scan side.
+    /// InList expressions are also more transparent and easier to serialize over the network in distributed uses of DataFusion.
+    /// On the other hand InList pushdown requires making a copy of the data and thus adds some overhead to the build side and uses more memory.
+    ///
+    /// This setting is per-partition, so we may end up using `hash_join_inlist_pushdown_max_size` * `target_partitions` memory.
+    ///
+    /// The default is 128kB per partition.
+    /// This should allow point lookup joins (e.g. joining on a unique primary key) to use InList pushdown in most cases
+    /// but avoids excessive memory usage or overhead for larger joins.
+    pub hash_join_inlist_pushdown_max_size: usize, default = 128 * 1024
+
+    /// Maximum number of distinct values (rows) in the build side of a hash join to be pushed down as an InList expression for dynamic filtering.
+    /// Build sides with more rows than this will use hash table lookups instead.
+    /// Set to 0 to always use hash table lookups.
+    ///
+    /// This provides an additional limit beyond `hash_join_inlist_pushdown_max_size` to prevent
+    /// very large IN lists that might not provide much benefit over hash table lookups.
+    ///
+    /// This uses the deduplicated row count once the build side has been evaluated.
+    ///
+    /// The default is 150 values per partition.
+    /// This is inspired by Trino's `max-filter-keys-per-column` setting.
+    /// See:
+    pub hash_join_inlist_pushdown_max_distinct_values: usize, default = 150
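The two limits act as a combined gate. The following sketch restates that decision logic; it is not DataFusion's internal code, only an illustration of the documented behavior with the default values:

```rust
// Sketch of the two-threshold InList pushdown gate described above.
struct InListPushdownLimits {
    max_size_bytes: usize,      // hash_join_inlist_pushdown_max_size (default 128 KiB)
    max_distinct_values: usize, // hash_join_inlist_pushdown_max_distinct_values (default 150)
}

impl InListPushdownLimits {
    /// Returns true if the (deduplicated) build side may be pushed down
    /// as an InList expression; otherwise hash-table lookups are used.
    fn use_inlist(&self, build_side_bytes: usize, distinct_values: usize) -> bool {
        // A threshold of 0 disables InList pushdown entirely.
        self.max_size_bytes > 0
            && self.max_distinct_values > 0
            && build_side_bytes <= self.max_size_bytes
            && distinct_values <= self.max_distinct_values
    }
}

fn main() {
    let limits = InListPushdownLimits {
        max_size_bytes: 128 * 1024,
        max_distinct_values: 150,
    };
    assert!(limits.use_inlist(4 * 1024, 100)); // small point-lookup join: InList
    assert!(!limits.use_inlist(4 * 1024, 5_000)); // too many keys: hash lookups
}
```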
+
     /// The default filter selectivity used by Filter Statistics
     /// when an exact selectivity cannot be determined. Valid values are
     /// between 0 (no selectivity) and 100 (all rows are selected).
@@ -966,6 +1327,27 @@ config_namespace! {
     /// then the output will be coerced to a non-view.
     /// Coerces `Utf8View` to `LargeUtf8`, and `BinaryView` to `LargeBinary`.
     pub expand_views_at_output: bool, default = false
+
+    /// Enable sort pushdown optimization.
+    /// When enabled, attempts to push sort requirements down to data sources
+    /// that can natively handle them (e.g., by reversing file/row group read order).
+    ///
+    /// Returns **inexact ordering**: Sort operator is kept for correctness,
+    /// but optimized input enables early termination for TopK queries (ORDER BY ... LIMIT N),
+    /// providing significant speedup.
+    ///
+    /// Memory: No additional overhead (only changes read order).
+    ///
+    /// Future: Will add option to detect perfectly sorted data and eliminate Sort completely.
+    ///
+    /// Default: true
+    pub enable_sort_pushdown: bool, default = true
+
+    /// When set to true, the optimizer will extract leaf expressions
+    /// (such as `get_field`) from filter/sort/join nodes into projections
+    /// closer to the leaf table scans, and push those projections down
+    /// towards the leaf nodes.
+    pub enable_leaf_expression_pushdown: bool, default = true
     }
 }

@@ -1003,7 +1385,13 @@ config_namespace! {
     /// Verbosity level for "EXPLAIN ANALYZE". Default is "dev"
     /// "summary" shows common metrics for high-level insights.
     /// "dev" provides deep operator-level introspection for developers.
-    pub analyze_level: ExplainAnalyzeLevel, default = ExplainAnalyzeLevel::Dev
+    pub analyze_level: MetricType, default = MetricType::Dev
+
+    /// Which metric categories to include in "EXPLAIN ANALYZE" output.
+    /// Comma-separated list of: "rows", "bytes", "timing", "uncategorized".
+    /// Use "none" to show plan structure only, or "all" (default) to show everything.
+    /// Metrics without a declared category are treated as "uncategorized".
+    pub analyze_categories: ExplainAnalyzeCategories, default = ExplainAnalyzeCategories::All
     }
 }

@@ -1046,35 +1434,35 @@ config_namespace! {
     }
 }

-impl<'a> TryInto<arrow::util::display::FormatOptions<'a>> for &'a FormatOptions {
+impl<'a> TryFrom<&'a FormatOptions> for arrow::util::display::FormatOptions<'a> {
     type Error = DataFusionError;

-    fn try_into(self) -> Result<arrow::util::display::FormatOptions<'a>> {
-        let duration_format = match self.duration_format.as_str() {
+    fn try_from(options: &'a FormatOptions) -> Result<Self> {
+        let duration_format = match options.duration_format.as_str() {
             "pretty" => arrow::util::display::DurationFormat::Pretty,
             "iso8601" => arrow::util::display::DurationFormat::ISO8601,
             _ => {
                 return _config_err!(
                     "Invalid duration format: {}. Valid values are pretty or iso8601",
-                    self.duration_format
-                )
+                    options.duration_format
+                );
             }
         };

-        Ok(arrow::util::display::FormatOptions::new()
-            .with_display_error(self.safe)
-            .with_null(&self.null)
-            .with_date_format(self.date_format.as_deref())
-            .with_datetime_format(self.datetime_format.as_deref())
-            .with_timestamp_format(self.timestamp_format.as_deref())
-            .with_timestamp_tz_format(self.timestamp_tz_format.as_deref())
-            .with_time_format(self.time_format.as_deref())
+        Ok(Self::new()
+            .with_display_error(options.safe)
+            .with_null(&options.null)
+            .with_date_format(options.date_format.as_deref())
+            .with_datetime_format(options.datetime_format.as_deref())
+            .with_timestamp_format(options.timestamp_format.as_deref())
+            .with_timestamp_tz_format(options.timestamp_tz_format.as_deref())
+            .with_time_format(options.time_format.as_deref())
             .with_duration_format(duration_format)
-            .with_types_info(self.types_info))
+            .with_types_info(options.types_info))
     }
 }

 /// A key value pair, with a corresponding description
-#[derive(Debug, Hash, PartialEq, Eq)]
+#[derive(Debug, Clone, Hash, PartialEq, Eq)]
 pub struct ConfigEntry {
     /// A unique string to identify this config value
     pub key: String,
@@ -1107,6 +1495,15 @@ pub struct ConfigOptions {
 }

 impl ConfigField for ConfigOptions {
+    fn visit<V: Visit>(&self, v: &mut V, _key_prefix: &str, _description: &'static str) {
+        self.catalog.visit(v, "datafusion.catalog", "");
+        self.execution.visit(v, "datafusion.execution", "");
+        self.optimizer.visit(v, "datafusion.optimizer", "");
+        self.explain.visit(v, "datafusion.explain", "");
+        self.sql_parser.visit(v, "datafusion.sql_parser", "");
+        self.format.visit(v, "datafusion.format", "");
+    }
+
     fn set(&mut self, key: &str, value: &str) -> Result<()> {
         // Extensions are handled in the public `ConfigOptions::set`
         let (key, rem) = key.split_once('.').unwrap_or((key, ""));
@@ -1121,16 +1518,50 @@ impl ConfigField for ConfigOptions {
         }
     }

-    fn visit<V: Visit>(&self, v: &mut V, _key_prefix: &str, _description: &'static str) {
-        self.catalog.visit(v, "datafusion.catalog", "");
-        self.execution.visit(v, "datafusion.execution", "");
-        self.optimizer.visit(v, "datafusion.optimizer", "");
-        self.explain.visit(v, "datafusion.explain", "");
-        self.sql_parser.visit(v, "datafusion.sql_parser", "");
-        self.format.visit(v, "datafusion.format", "");
+    /// Reset a configuration option back to its default value
+    fn reset(&mut self, key: &str) -> Result<()> {
+        let Some((prefix, rest)) = key.split_once('.') else {
+            return _config_err!("could not find config namespace for key \"{key}\"");
+        };
+
+        if prefix != "datafusion" {
+            return _config_err!("Could not find config namespace \"{prefix}\"");
+        }
+
+        let (section, rem) = rest.split_once('.').unwrap_or((rest, ""));
+        if rem.is_empty() {
+            return _config_err!("could not find config field for key \"{key}\"");
+        }
+
+        match section {
+            "catalog" => self.catalog.reset(rem),
+            "execution" => self.execution.reset(rem),
+            "optimizer" => {
+                if rem == "enable_dynamic_filter_pushdown" {
+                    let defaults = OptimizerOptions::default();
+                    self.optimizer.enable_dynamic_filter_pushdown =
+                        defaults.enable_dynamic_filter_pushdown;
+                    self.optimizer.enable_topk_dynamic_filter_pushdown =
+                        defaults.enable_topk_dynamic_filter_pushdown;
+                    self.optimizer.enable_join_dynamic_filter_pushdown =
+                        defaults.enable_join_dynamic_filter_pushdown;
+                    Ok(())
+                } else {
+                    self.optimizer.reset(rem)
+                }
+            }
+            "explain" => self.explain.reset(rem),
+            "sql_parser" => self.sql_parser.reset(rem),
+            "format" => self.format.reset(rem),
+            other => _config_err!("Config value \"{other}\" not found on ConfigOptions"),
+        }
+    }
 }
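Note that `enable_dynamic_filter_pushdown` is special-cased in both `set` and `reset`, so the umbrella key fans out to its linked options. A sketch of that behavior (illustrative; `ConfigField` must be in scope for `reset`):

```rust
use datafusion_common::config::{ConfigField, ConfigOptions};
use datafusion_common::Result;

fn main() -> Result<()> {
    let mut config = ConfigOptions::new();

    // Setting the umbrella key also flips the per-operator options...
    config.set("datafusion.optimizer.enable_dynamic_filter_pushdown", "false")?;
    assert!(!config.optimizer.enable_topk_dynamic_filter_pushdown);

    // ...and resetting it restores them to their defaults (true).
    config.reset("datafusion.optimizer.enable_dynamic_filter_pushdown")?;
    assert!(config.optimizer.enable_topk_dynamic_filter_pushdown);
    assert!(config.optimizer.enable_join_dynamic_filter_pushdown);
    Ok(())
}
```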
+/// This namespace is reserved for interacting with Foreign Function Interface
+/// (FFI) based configuration extensions.
+pub const DATAFUSION_FFI_CONFIG_NAMESPACE: &str = "datafusion_ffi";
+
 impl ConfigOptions {
     /// Creates a new [`ConfigOptions`] with default values
     pub fn new() -> Self {
@@ -1145,12 +1576,12 @@ impl ConfigOptions {

     /// Set a configuration option
     pub fn set(&mut self, key: &str, value: &str) -> Result<()> {
-        let Some((prefix, key)) = key.split_once('.') else {
+        let Some((mut prefix, mut inner_key)) = key.split_once('.') else {
             return _config_err!("could not find config namespace for key \"{key}\"");
         };

         if prefix == "datafusion" {
-            if key == "optimizer.enable_dynamic_filter_pushdown" {
+            if inner_key == "optimizer.enable_dynamic_filter_pushdown" {
                 let bool_value = value.parse::<bool>().map_err(|e| {
                     DataFusionError::Configuration(format!(
                         "Failed to parse '{value}' as bool: {e}",
@@ -1161,16 +1592,27 @@ impl ConfigOptions {
                 self.optimizer.enable_dynamic_filter_pushdown = bool_value;
                 self.optimizer.enable_topk_dynamic_filter_pushdown = bool_value;
                 self.optimizer.enable_join_dynamic_filter_pushdown = bool_value;
+                self.optimizer.enable_aggregate_dynamic_filter_pushdown = bool_value;
                 }
                 return Ok(());
             }
-        return ConfigField::set(self, key, value);
+            return ConfigField::set(self, inner_key, value);
+        }
+
+        if !self.extensions.0.contains_key(prefix)
+            && self
+                .extensions
+                .0
+                .contains_key(DATAFUSION_FFI_CONFIG_NAMESPACE)
+        {
+            inner_key = key;
+            prefix = DATAFUSION_FFI_CONFIG_NAMESPACE;
         }

         let Some(e) = self.extensions.0.get_mut(prefix) else {
             return _config_err!("Could not find config namespace \"{prefix}\"");
         };
-        e.0.set(key, value)
+        e.0.set(inner_key, value)
     }

     /// Create new [`ConfigOptions`], taking values from environment variables
@@ -1420,6 +1862,14 @@ impl Extensions {
         let e = self.0.get_mut(T::PREFIX)?;
         e.0.as_any_mut().downcast_mut()
     }
+
+    /// Iterates all the config extension entries yielding their prefix and their
+    /// [ExtensionOptions] implementation.
+    pub fn iter(
+        &self,
+    ) -> impl Iterator<Item = (&'static str, &dyn ExtensionOptions)> {
+        self.0.iter().map(|(k, v)| (*k, &v.0))
+    }
 }

 #[derive(Debug)]
@@ -1437,6 +1887,10 @@ pub trait ConfigField {
     fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str);

     fn set(&mut self, key: &str, value: &str) -> Result<()>;
+
+    fn reset(&mut self, key: &str) -> Result<()> {
+        _config_err!("Reset is not supported for this config field, key: {}", key)
+    }
 }

 impl<F: ConfigField + Default> ConfigField for Option<F> {
@@ -1450,6 +1904,15 @@ impl<F: ConfigField + Default> ConfigField for Option<F> {
     fn set(&mut self, key: &str, value: &str) -> Result<()> {
         self.get_or_insert_with(Default::default).set(key, value)
     }
+
+    fn reset(&mut self, key: &str) -> Result<()> {
+        if key.is_empty() {
+            *self = Default::default();
+            Ok(())
+        } else {
+            self.get_or_insert_with(Default::default).reset(key)
+        }
+    }
 }

 /// Default transformation to parse a [`ConfigField`] for a string.
@@ -1514,6 +1977,19 @@ macro_rules! config_field {
                 *self = $transform;
                 Ok(())
             }
+
+            fn reset(&mut self, key: &str) -> $crate::error::Result<()> {
+                if key.is_empty() {
+                    *self = <$t as Default>::default();
+                    Ok(())
+                } else {
+                    $crate::error::_config_err!(
+                        "Config field is a scalar {} and does not have nested field \"{}\"",
+                        stringify!($t),
+                        key
+                    )
+                }
+            }
         }
     };
 }
@@ -1523,6 +1999,8 @@ config_field!(bool, value => default_config_transform(value.to_lowercase().as_str())?);
 config_field!(usize);
 config_field!(f64);
 config_field!(u64);
+config_field!(u32);
+config_field!(i32);

 impl ConfigField for u8 {
     fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
@@ -1564,6 +2042,17 @@
     }
 }

+impl ConfigField for CsvQuoteStyle {
+    fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
+        v.some(key, self, description)
+    }
+
+    fn set(&mut self, _: &str, value: &str) -> Result<()> {
+        *self = CsvQuoteStyle::from_str(value)?;
+        Ok(())
+    }
+}
+
 /// An implementation trait used to recursively walk configuration
 pub trait Visit {
     fn some<V: Display>(&mut self, key: &str, value: V, description: &'static str);
@@ -1713,8 +2202,7 @@ macro_rules! extensions_options {
                         // Safely apply deprecated attribute if present
                         // $(#[allow(deprecated)])?
                         {
-                            #[allow(deprecated)]
-                            self.$field_name.set(rem, value.as_ref())
+                            self.$field_name.set(rem, value.as_ref())
                         }
                     },
                 )*
@@ -1728,7 +2216,6 @@ macro_rules! extensions_options {
             $(
                 let key = stringify!($field_name).to_string();
                 let desc = concat!($($d),*).trim();
-                #[allow(deprecated)]
                 self.$field_name.visit(v, key.as_str(), desc);
             )*
         }
@@ -1902,7 +2389,7 @@ impl TableOptions {
     ///
     /// A result indicating success or failure in setting the configuration option.
     pub fn set(&mut self, key: &str, value: &str) -> Result<()> {
-        let Some((prefix, _)) = key.split_once('.') else {
+        let Some((mut prefix, _)) = key.split_once('.') else {
             return _config_err!("could not find config namespace for key \"{key}\"");
         };
@@ -1914,6 +2401,15 @@ impl TableOptions {
             return Ok(());
         }

+        if !self.extensions.0.contains_key(prefix)
+            && self
+                .extensions
+                .0
+                .contains_key(DATAFUSION_FFI_CONFIG_NAMESPACE)
+        {
+            prefix = DATAFUSION_FFI_CONFIG_NAMESPACE;
+        }
+
         let Some(e) = self.extensions.0.get_mut(prefix) else {
             return _config_err!("Could not find config namespace \"{prefix}\"");
         };
@@ -1999,7 +2495,7 @@ impl TableOptions {

 /// Options that control how Parquet files are read, including global options
 /// that apply to all columns and optional column-specific overrides
 ///
-/// Closely tied to [`ParquetWriterOptions`](crate::file_options::parquet_writer::ParquetWriterOptions).
+/// Closely tied to `ParquetWriterOptions` (see `crate::file_options::parquet_writer::ParquetWriterOptions` when the "parquet" feature is enabled).
 /// Properties not included in [`TableParquetOptions`] may not be configurable at the external API
 /// (e.g. sorting_columns).
 #[derive(Clone, Default, Debug, PartialEq)]
@@ -2119,13 +2615,13 @@ impl ConfigField for TableParquetOptions {
             [_meta] | [_meta, ""] => {
                 return _config_err!(
                     "Invalid metadata key provided, missing key in metadata::"
-                )
+                );
             }
             [_meta, k] => k.into(),
             _ => {
                 return _config_err!(
                     "Invalid metadata key provided, found too many '::' in \"{key}\""
-                )
+                );
             }
         };
         self.key_value_metadata.insert(k, Some(value.into()));
@@ -2171,7 +2667,6 @@ macro_rules! config_namespace_with_hashmap {
             $(
                 stringify!($field_name) => {
                     // Handle deprecated fields
-                    #[allow(deprecated)] // Allow deprecated fields
                     $(let value = $transform(value);)?
                     self.$field_name.set(rem, value.as_ref())
                 },
@@ -2187,7 +2682,6 @@ macro_rules! config_namespace_with_hashmap {
                 let key = format!(concat!("{}.", stringify!($field_name)), key_prefix);
                 let desc = concat!($($d),*).trim();
                 // Handle deprecated fields
-                #[allow(deprecated)]
                 self.$field_name.visit(v, key.as_str(), desc);
             )*
         }
@@ -2195,7 +2689,6 @@ macro_rules! config_namespace_with_hashmap {

     impl Default for $struct_name {
         fn default() -> Self {
-            #[allow(deprecated)]
             Self {
                 $($field_name: $default),*
             }
@@ -2223,7 +2716,6 @@ macro_rules! config_namespace_with_hashmap {
             $(
                 let key = format!("{}.{field}::{}", key_prefix, column_name, field = stringify!($field_name));
                 let desc = concat!($($d),*).trim();
-                #[allow(deprecated)]
                 col_options.$field_name.visit(v, key.as_str(), desc);
             )*
         }
@@ -2254,7 +2746,7 @@ config_namespace_with_hashmap! {

     /// Sets default parquet compression codec for the column path.
     /// Valid values are: uncompressed, snappy, gzip(level),
-    /// lzo, brotli(level), lz4, zstd(level), and lz4_raw.
+    /// brotli(level), lz4, zstd(level), and lz4_raw.
     /// These values are not case-sensitive. If NULL, uses
     /// default parquet options
     pub compression: Option<String>, transform = str::to_lowercase, default = None
@@ -2437,10 +2929,7 @@ impl From<&Arc<FileEncryptionProperties>> for ConfigFileEncryptionProperties {
             },
         );
     }
-        let mut aad_prefix: Vec<u8> = Vec::new();
-        if let Some(prefix) = f.aad_prefix() {
-            aad_prefix = prefix.clone();
-        }
+        let aad_prefix = f.aad_prefix().cloned().unwrap_or_default();
         ConfigFileEncryptionProperties {
             encrypt_footer: f.encrypt_footer(),
             footer_key_as_hex: hex::encode(f.footer_key()),
@@ -2522,7 +3011,7 @@ impl ConfigField for ConfigFileDecryptionProperties {
                 self.footer_signature_verification.set(rem, value.as_ref())
             }
             _ => _config_err!(
-                "Config value \"{}\" not found on ConfigFileEncryptionProperties",
+                "Config value \"{}\" not found on ConfigFileDecryptionProperties",
                 key
             ),
         }
@@ -2564,8 +3053,18 @@ impl From<ConfigFileDecryptionProperties> for FileDecryptionProperties {
 }

 #[cfg(feature = "parquet_encryption")]
-impl From<&Arc<FileDecryptionProperties>> for ConfigFileDecryptionProperties {
-    fn from(f: &Arc<FileDecryptionProperties>) -> Self {
+impl TryFrom<&Arc<FileDecryptionProperties>> for ConfigFileDecryptionProperties {
+    type Error = DataFusionError;
+
+    fn try_from(f: &Arc<FileDecryptionProperties>) -> Result<Self> {
+        let footer_key = f.footer_key(None).map_err(|e| {
+            DataFusionError::Configuration(format!(
+                "Could not retrieve footer key from FileDecryptionProperties. \
+                Note that conversion to ConfigFileDecryptionProperties is not supported \
+                when using a key retriever: {e}"
+            ))
+        })?;
+
+        let (column_names_vec, column_keys_vec) = f.column_keys();
         let mut column_decryption_properties: HashMap<
             String,
@@ -2578,18 +3077,13 @@ impl From<&Arc<FileDecryptionProperties>> for ConfigFileDecryptionProperties {
             column_decryption_properties.insert(column_name.clone(), props);
         }

-        let mut aad_prefix: Vec<u8> = Vec::new();
-        if let Some(prefix) = f.aad_prefix() {
-            aad_prefix = prefix.clone();
-        }
-
-        ConfigFileDecryptionProperties {
-            footer_key_as_hex: hex::encode(
-                f.footer_key(None).unwrap_or_default().as_ref(),
-            ),
+        let aad_prefix = f.aad_prefix().cloned().unwrap_or_default();
+        Ok(ConfigFileDecryptionProperties {
+            footer_key_as_hex: hex::encode(footer_key.as_ref()),
             column_decryption_properties,
             aad_prefix_as_hex: hex::encode(aad_prefix),
             footer_signature_verification: f.check_plaintext_footer_integrity(),
-        }
+        })
     }
 }

@@ -2639,6 +3133,15 @@ config_namespace! {
     pub terminator: Option<u8>, default = None
     pub escape: Option<u8>, default = None
     pub double_quote: Option<bool>, default = None
+    /// Quote style for CSV writing.
+    /// One of: "Always", "Necessary", "NonNumeric", "Never"
+    pub quote_style: CsvQuoteStyle, default = CsvQuoteStyle::Necessary
+    /// Whether to ignore leading whitespace in string values when writing CSV.
+    /// Defaults to `false` when `None`.
+    pub ignore_leading_whitespace: Option<bool>, default = None
+    /// Whether to ignore trailing whitespace in string values when writing CSV.
+    /// Defaults to `false` when `None`.
+    pub ignore_trailing_whitespace: Option<bool>, default = None
     /// Specifies whether newlines in (quoted) values are supported.
     ///
     /// Parsing newlines in quoted values may be affected by execution behaviour such as
     ///
     /// The default behaviour depends on the `datafusion.catalog.newlines_in_values` setting.
     pub newlines_in_values: Option<bool>, default = None
     pub compression: CompressionTypeVariant, default = CompressionTypeVariant::UNCOMPRESSED
+    /// Compression level for the output file. The valid range depends on the
+    /// compression algorithm:
+    /// - ZSTD: 1 to 22 (default: 3)
+    /// - GZIP: 0 to 9 (default: 6)
+    /// - BZIP2: 0 to 9 (default: 6)
+    /// - XZ: 0 to 9 (default: 6)
+    /// If not specified, the default level for the compression algorithm is used.
+    pub compression_level: Option<u32>, default = None
     pub schema_infer_max_rec: Option<usize>, default = None
     pub date_format: Option<String>, default = None
     pub datetime_format: Option<String>, default = None
@@ -2739,6 +3250,30 @@ impl CsvOptions {
         self
     }

+    /// Set the quote style for CSV writing.
+    pub fn with_quote_style(mut self, quote_style: CsvQuoteStyle) -> Self {
+        self.quote_style = quote_style;
+        self
+    }
+
+    /// Set whether to ignore leading whitespace in string values when writing CSV.
+    pub fn with_ignore_leading_whitespace(
+        mut self,
+        ignore_leading_whitespace: bool,
+    ) -> Self {
+        self.ignore_leading_whitespace = Some(ignore_leading_whitespace);
+        self
+    }
+
+    /// Set whether to ignore trailing whitespace in string values when writing CSV.
+    pub fn with_ignore_trailing_whitespace(
+        mut self,
+        ignore_trailing_whitespace: bool,
+    ) -> Self {
+        self.ignore_trailing_whitespace = Some(ignore_trailing_whitespace);
+        self
+    }
+
     /// Specifies whether newlines in (quoted) values are supported.
     ///
     /// Parsing newlines in quoted values may be affected by execution behaviour such as
@@ -2770,6 +3305,14 @@ impl CsvOptions {
         self
     }

+    /// Set the compression level for the output file.
+    /// The valid range depends on the compression algorithm.
+    /// If not specified, the default level for the algorithm is used.
+    pub fn with_compression_level(mut self, level: u32) -> Self {
+        self.compression_level = Some(level);
+        self
+    }
+
     /// The delimiter character.
     pub fn delimiter(&self) -> u8 {
         self.delimiter
@@ -2795,14 +3338,38 @@ config_namespace! {
     /// Options controlling JSON format
     pub struct JsonOptions {
         pub compression: CompressionTypeVariant, default = CompressionTypeVariant::UNCOMPRESSED
+        /// Compression level for the output file. The valid range depends on the
+        /// compression algorithm:
+        /// - ZSTD: 1 to 22 (default: 3)
+        /// - GZIP: 0 to 9 (default: 6)
+        /// - BZIP2: 0 to 9 (default: 6)
+        /// - XZ: 0 to 9 (default: 6)
+        /// If not specified, the default level for the compression algorithm is used.
+        pub compression_level: Option<u32>, default = None
         pub schema_infer_max_rec: Option<usize>, default = None
+        /// The JSON format to use when reading files.
+        ///
+        /// When `true` (default), expects newline-delimited JSON (NDJSON):
+        /// ```text
+        /// {"key1": 1, "key2": "val"}
+        /// {"key1": 2, "key2": "vals"}
+        /// ```
+        ///
+        /// When `false`, expects JSON array format:
+        /// ```text
+        /// [
+        ///   {"key1": 1, "key2": "val"},
+        ///   {"key1": 2, "key2": "vals"}
+        /// ]
+        /// ```
+        pub newline_delimited: bool, default = true
     }
 }
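The new CSV writer knobs compose with the existing builder style. A short sketch, assuming the builder names from the diff above and the `NonNumeric` variant listed in the `quote_style` docs:

```rust
use datafusion_common::config::CsvOptions;
use datafusion_common::parsers::CsvQuoteStyle;

fn main() {
    let options = CsvOptions::default()
        .with_quote_style(CsvQuoteStyle::NonNumeric) // quote only non-numeric fields
        .with_ignore_trailing_whitespace(true)
        .with_compression_level(6); // e.g. GZIP level 6

    assert_eq!(options.compression_level, Some(6));
    assert_eq!(options.ignore_trailing_whitespace, Some(true));
}
```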
 pub trait OutputFormatExt: Display {}

 #[derive(Debug, Clone, PartialEq)]
-#[allow(clippy::large_enum_variant)]
+#[cfg_attr(feature = "parquet", expect(clippy::large_enum_variant))]
 pub enum OutputFormat {
     CSV(CsvOptions),
     JSON(JsonOptions),
@@ -2836,7 +3403,6 @@ mod tests {
     };
     use std::any::Any;
     use std::collections::HashMap;
-    use std::sync::Arc;

     #[derive(Default, Debug, Clone)]
     pub struct TestExtensionConfig {
@@ -2908,6 +3474,16 @@ mod tests {
         );
     }

+    #[test]
+    fn iter_test_extension_config() {
+        let mut extension = Extensions::new();
+        extension.insert(TestExtensionConfig::default());
+        let table_config = TableOptions::new().with_extensions(extension);
+        let extensions = table_config.extensions.iter().collect::<Vec<_>>();
+        assert_eq!(extensions.len(), 1);
+        assert_eq!(extensions[0].0, TestExtensionConfig::PREFIX);
+    }
+
     #[test]
     fn csv_u8_table_options() {
         let mut table_config = TableOptions::new();
@@ -2951,6 +3527,19 @@ mod tests {
         assert_eq!(COUNT.load(std::sync::atomic::Ordering::Relaxed), 1);
     }

+    #[test]
+    fn reset_nested_scalar_reports_helpful_error() {
+        let mut value = true;
+        let err = <bool as ConfigField>::reset(&mut value, "nested").unwrap_err();
+        let message = err.to_string();
+        assert!(
+            message.starts_with(
+                "Invalid or Unsupported Configuration: Config field is a scalar bool and does not have nested field \"nested\""
+            ),
+            "unexpected error message: {message}"
+        );
+    }
+
     #[cfg(feature = "parquet")]
     #[test]
     fn parquet_table_options() {
@@ -2973,6 +3562,7 @@ mod tests {
         };
         use parquet::encryption::decrypt::FileDecryptionProperties;
         use parquet::encryption::encrypt::FileEncryptionProperties;
+        use std::sync::Arc;

         let footer_key = b"0123456789012345".to_vec(); // 128bit/16
         let column_names = vec!["double_field", "float_field"];
@@ -2999,7 +3589,8 @@ mod tests {
             Arc::new(FileEncryptionProperties::from(config_encrypt.clone()));
         assert_eq!(file_encryption_properties, encryption_properties_built);

-        let config_decrypt = ConfigFileDecryptionProperties::from(&decryption_properties);
+        let config_decrypt =
+            ConfigFileDecryptionProperties::try_from(&decryption_properties).unwrap();
         let decryption_properties_built =
             Arc::new(FileDecryptionProperties::from(config_decrypt.clone()));
         assert_eq!(decryption_properties, decryption_properties_built);

@@ -3117,6 +3708,42 @@ mod tests {
         assert_eq!(factory_options.get("key2"), Some(&"value 2".to_string()));
     }

+    #[cfg(feature = "parquet_encryption")]
+    struct ParquetEncryptionKeyRetriever {}
+
+    #[cfg(feature = "parquet_encryption")]
+    impl parquet::encryption::decrypt::KeyRetriever for ParquetEncryptionKeyRetriever {
+        fn retrieve_key(&self, key_metadata: &[u8]) -> parquet::errors::Result<Vec<u8>> {
+            if !key_metadata.is_empty() {
+                Ok(b"1234567890123450".to_vec())
+            } else {
+                Err(parquet::errors::ParquetError::General(
+                    "Key metadata not provided".to_string(),
+                ))
+            }
+        }
+    }
+
+    #[cfg(feature = "parquet_encryption")]
+    #[test]
+    fn conversion_from_key_retriever_to_config_file_decryption_properties() {
+        use crate::Result;
+        use crate::config::ConfigFileDecryptionProperties;
+        use crate::encryption::FileDecryptionProperties;
+
+        let retriever = std::sync::Arc::new(ParquetEncryptionKeyRetriever {});
+        let decryption_properties =
+            FileDecryptionProperties::with_key_retriever(retriever)
+                .build()
+                .unwrap();
+        let config_file_decryption_properties: Result<ConfigFileDecryptionProperties> =
+            (&decryption_properties).try_into();
+        assert!(config_file_decryption_properties.is_err());
+        let err = config_file_decryption_properties.unwrap_err().to_string();
+        assert!(err.contains("key retriever"));
+        assert!(err.contains("Key metadata not provided"));
+    }
+
     #[cfg(feature = "parquet")]
     #[test]
     fn parquet_table_options_config_entry() {
@@ -3126,9 +3753,11 @@ mod tests {
             .set("format.bloom_filter_enabled::col1", "true")
             .unwrap();
         let entries = table_config.entries();
-        assert!(entries
-            .iter()
-            .any(|item| item.key == "format.bloom_filter_enabled::col1"))
+        assert!(
+            entries
+                .iter()
+                .any(|item| item.key == "format.bloom_filter_enabled::col1")
+        )
     }

     #[cfg(feature = "parquet")]
@@ -3142,10 +3771,10 @@ mod tests {
         )
         .unwrap();
         let entries = table_parquet_options.entries();
-        assert!(entries
-            .iter()
-            .any(|item| item.key
-                == "crypto.file_encryption.column_key_as_hex::double_field"))
+        assert!(
+            entries.iter().any(|item| item.key
+                == "crypto.file_encryption.column_key_as_hex::double_field")
+        )
     }

     #[cfg(feature = "parquet")]
@@ -3181,4 +3810,110 @@ mod tests {
         let parsed_metadata = table_config.parquet.key_value_metadata;
         assert_eq!(parsed_metadata.get("key_dupe"), Some(&Some("B".into())));
     }
+    #[cfg(feature = "parquet")]
+    #[test]
+    fn test_parquet_writer_version_validation() {
+        use crate::{config::ConfigOptions, parquet_config::DFParquetWriterVersion};
+
+        let mut config = ConfigOptions::default();
+
+        // Valid values should work
+        config
+            .set("datafusion.execution.parquet.writer_version", "1.0")
+            .unwrap();
+        assert_eq!(
+            config.execution.parquet.writer_version,
+            DFParquetWriterVersion::V1_0
+        );
+
+        config
+            .set("datafusion.execution.parquet.writer_version", "2.0")
+            .unwrap();
+        assert_eq!(
+            config.execution.parquet.writer_version,
+            DFParquetWriterVersion::V2_0
+        );
+
+        // Invalid value should error immediately at SET time
+        let err = config
+            .set("datafusion.execution.parquet.writer_version", "3.0")
+            .unwrap_err();
+        assert_eq!(
+            err.to_string(),
+            "Invalid or Unsupported Configuration: Invalid parquet writer version: 3.0. 
Expected one of: 1.0, 2.0" + ); + } + + #[cfg(feature = "parquet")] + #[test] + fn set_cdc_option_with_boolean_true() { + use crate::config::ConfigOptions; + + let mut config = ConfigOptions::default(); + assert!( + config + .execution + .parquet + .use_content_defined_chunking + .is_none() + ); + + // Setting to "true" should enable CDC with default options + config + .set( + "datafusion.execution.parquet.use_content_defined_chunking", + "true", + ) + .unwrap(); + let cdc = config + .execution + .parquet + .use_content_defined_chunking + .as_ref() + .expect("CDC should be enabled"); + assert_eq!(cdc.min_chunk_size, 256 * 1024); + assert_eq!(cdc.max_chunk_size, 1024 * 1024); + assert_eq!(cdc.norm_level, 0); + + // Setting to "false" should disable CDC + config + .set( + "datafusion.execution.parquet.use_content_defined_chunking", + "false", + ) + .unwrap(); + assert!( + config + .execution + .parquet + .use_content_defined_chunking + .is_none() + ); + } + + #[cfg(feature = "parquet")] + #[test] + fn set_cdc_option_with_subfields() { + use crate::config::ConfigOptions; + + let mut config = ConfigOptions::default(); + + // Setting sub-fields should also enable CDC + config + .set( + "datafusion.execution.parquet.use_content_defined_chunking.min_chunk_size", + "1024", + ) + .unwrap(); + let cdc = config + .execution + .parquet + .use_content_defined_chunking + .as_ref() + .expect("CDC should be enabled"); + assert_eq!(cdc.min_chunk_size, 1024); + // Other fields should be defaults + assert_eq!(cdc.max_chunk_size, 1024 * 1024); + assert_eq!(cdc.norm_level, 0); + } } diff --git a/datafusion/common/src/cse.rs b/datafusion/common/src/cse.rs index 674d3386171f8..93169d6a02ff1 100644 --- a/datafusion/common/src/cse.rs +++ b/datafusion/common/src/cse.rs @@ -19,12 +19,12 @@ //! a [`CSEController`], that defines how to eliminate common subtrees from a particular //! [`TreeNode`] tree. +use crate::Result; use crate::hash_utils::combine_hashes; use crate::tree_node::{ Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter, TreeNodeVisitor, }; -use crate::Result; use indexmap::IndexMap; use std::collections::HashMap; use std::hash::{BuildHasher, Hash, Hasher, RandomState}; @@ -676,13 +676,13 @@ where #[cfg(test)] mod test { + use crate::Result; use crate::alias::AliasGenerator; use crate::cse::{ - CSEController, HashNode, IdArray, Identifier, NodeStats, NormalizeEq, - Normalizeable, CSE, + CSE, CSEController, HashNode, IdArray, Identifier, NodeStats, NormalizeEq, + Normalizeable, }; use crate::tree_node::tests::TestTreeNode; - use crate::Result; use std::collections::HashSet; use std::hash::{Hash, Hasher}; diff --git a/datafusion/common/src/datatype.rs b/datafusion/common/src/datatype.rs index 65f6395211866..19847f8583505 100644 --- a/datafusion/common/src/datatype.rs +++ b/datafusion/common/src/datatype.rs @@ -15,9 +15,10 @@ // specific language governing permissions and limitations // under the License. -//! [`DataTypeExt`] and [`FieldExt`] extension trait for working with DataTypes to Fields +//! 
[`DataTypeExt`] and [`FieldExt`] extension traits for working with Arrow [`DataType`] and [`Field`]s

 use crate::arrow::datatypes::{DataType, Field, FieldRef};
+use crate::metadata::FieldMetadata;
 use std::sync::Arc;

 /// DataFusion extension methods for Arrow [`DataType`]
@@ -61,7 +62,54 @@ impl DataTypeExt for DataType {
 }

 /// DataFusion extension methods for Arrow [`Field`] and [`FieldRef`]
+///
+/// This trait is implemented for both [`Field`] and [`FieldRef`] and
+/// provides convenience methods for efficiently working with both types.
+///
+/// For [`FieldRef`], the methods will attempt to unwrap the `Arc`
+/// to avoid unnecessary cloning when possible.
 pub trait FieldExt {
+    /// Ensure the field is named `new_name`, returning the given field if the
+    /// name matches, and a new field if not.
+    ///
+    /// This method avoids `clone`ing fields and names if the name is the same
+    /// as the field's existing name.
+    ///
+    /// Example:
+    /// ```
+    /// # use std::sync::Arc;
+    /// # use arrow::datatypes::{DataType, Field};
+    /// # use datafusion_common::datatype::FieldExt;
+    /// let int_field = Field::new("my_int", DataType::Int32, true);
+    /// // rename to "your_int"
+    /// let renamed_field = int_field.renamed("your_int");
+    /// assert_eq!(renamed_field.name(), "your_int");
+    /// ```
+    fn renamed(self, new_name: &str) -> Self;
+
+    /// Ensure the field has the given data type
+    ///
+    /// Note this is different than simply calling [`Field::with_data_type`] as
+    /// it avoids copying if the data type is already the same.
+    ///
+    /// Example:
+    /// ```
+    /// # use std::sync::Arc;
+    /// # use arrow::datatypes::{DataType, Field};
+    /// # use datafusion_common::datatype::FieldExt;
+    /// let int_field = Field::new("my_int", DataType::Int32, true);
+    /// // change to Float64
+    /// let retyped_field = int_field.retyped(DataType::Float64);
+    /// assert_eq!(retyped_field.data_type(), &DataType::Float64);
+    /// ```
+    fn retyped(self, new_data_type: DataType) -> Self;
+
+    /// Add field metadata to the Field
+    fn with_field_metadata(self, metadata: &FieldMetadata) -> Self;
+
+    /// Add optional field metadata to the Field, if provided
+    fn with_field_metadata_opt(self, metadata: Option<&FieldMetadata>) -> Self;
+
     /// Returns a new Field representing a List of this Field's DataType.
     ///
     /// For example if input represents an `Int32`, the return value will
@@ -130,6 +178,32 @@ pub trait FieldExt {
 }

 impl FieldExt for Field {
+    fn renamed(self, new_name: &str) -> Self {
+        // check if this is a new name before allocating a new Field / copying
+        // the existing one
+        if self.name() != new_name {
+            self.with_name(new_name)
+        } else {
+            self
+        }
+    }
+
+    fn retyped(self, new_data_type: DataType) -> Self {
+        self.with_data_type(new_data_type)
+    }
+
+    fn with_field_metadata(self, metadata: &FieldMetadata) -> Self {
+        metadata.add_to_field(self)
+    }
+
+    fn with_field_metadata_opt(self, metadata: Option<&FieldMetadata>) -> Self {
+        if let Some(metadata) = metadata {
+            self.with_field_metadata(metadata)
+        } else {
+            self
+        }
+    }
+
     fn into_list(self) -> Self {
         DataType::List(Arc::new(self.into_list_item())).into_nullable_field()
     }
@@ -149,6 +223,34 @@ impl FieldExt for Field {
 }

 impl FieldExt for Arc<Field> {
+    fn renamed(mut self, new_name: &str) -> Self {
+        if self.name() != new_name {
+            // avoid cloning if possible
+            Arc::make_mut(&mut self).set_name(new_name);
+        }
+        self
+    }
+
+    fn retyped(mut self, new_data_type: DataType) -> Self {
+        if self.data_type() != &new_data_type {
+            // avoid cloning if possible
+            Arc::make_mut(&mut self).set_data_type(new_data_type);
+        }
+        self
+    }
+
+    fn with_field_metadata(self, metadata: &FieldMetadata) -> Self {
+        metadata.add_to_field_ref(self)
+    }
+
+    fn with_field_metadata_opt(self, metadata: Option<&FieldMetadata>) -> Self {
+        if let Some(metadata) = metadata {
+            self.with_field_metadata(metadata)
+        } else {
+            self
+        }
+    }
+
     fn into_list(self) -> Self {
         DataType::List(self.into_list_item())
             .into_nullable_field()
@@ -161,13 +263,11 @@ impl FieldExt for Arc<Field> {
             .into()
     }

-    fn into_list_item(self) -> Self {
+    fn into_list_item(mut self) -> Self {
         if self.name() != Field::LIST_FIELD_DEFAULT_NAME {
-            Arc::unwrap_or_clone(self)
-                .with_name(Field::LIST_FIELD_DEFAULT_NAME)
-                .into()
-        } else {
-            self
+            // avoid cloning if possible
+            Arc::make_mut(&mut self).set_name(Field::LIST_FIELD_DEFAULT_NAME);
         }
+        self
     }
 }
diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs
index 24d152a7dba8c..e3da99163ed69 100644
--- a/datafusion/common/src/dfschema.rs
+++ b/datafusion/common/src/dfschema.rs
@@ -21,12 +21,12 @@ use std::collections::{BTreeSet, HashMap, HashSet};
 use std::fmt::{Display, Formatter};
 use std::hash::Hash;
-use std::sync::Arc;
+use std::sync::{Arc, LazyLock};

-use crate::error::{DataFusionError, Result, _plan_err, _schema_err};
+use crate::error::{_plan_err, _schema_err, DataFusionError, Result};
 use crate::{
-    field_not_found, unqualified_field_not_found, Column, FunctionalDependencies,
-    SchemaError, TableReference,
+    Column, FunctionalDependencies, SchemaError, TableReference, field_not_found,
+    unqualified_field_not_found,
 };

 use arrow::compute::can_cast_types;
@@ -37,7 +37,7 @@ use arrow::datatypes::{

 /// A reference-counted reference to a [DFSchema].
 pub type DFSchemaRef = Arc<DFSchema>;

-/// DFSchema wraps an Arrow schema and adds relation names.
+/// DFSchema wraps an Arrow schema and adds a relation (table) name.
 ///
 /// The schema may hold the fields across multiple tables. Some fields may be
 /// qualified and some unqualified. A qualified field is a field that has a
@@ -47,8 +47,14 @@ pub type DFSchemaRef = Arc<DFSchema>;
 /// have a distinct name from any qualified field names. This allows finding a
 /// qualified field by name to be possible, so long as there aren't multiple
 /// qualified fields with the same name.
+///
+/// # See Also
+/// * [DFSchemaRef], an alias to `Arc<DFSchema>`
+/// * [DataTypeExt], common methods for working with Arrow [DataType]s
+/// * [FieldExt], extension methods for working with Arrow [Field]s
 ///
-/// There is an alias to `Arc<DFSchema>` named [DFSchemaRef].
+/// [DataTypeExt]: crate::datatype::DataTypeExt
+/// [FieldExt]: crate::datatype::FieldExt
 ///
 /// # Creating qualified schemas
 ///
@@ -123,6 +129,13 @@ impl DFSchema {
         }
     }

+    /// Returns a reference to a shared empty [`DFSchema`].
+    pub fn empty_ref() -> &'static DFSchemaRef {
+        static EMPTY: LazyLock<DFSchemaRef> =
+            LazyLock::new(|| Arc::new(DFSchema::empty()));
+        &EMPTY
+    }
+
     /// Return a reference to the inner Arrow [`Schema`]
     ///
     /// Note this does not have the qualifier information
@@ -346,20 +359,22 @@ impl DFSchema {
         self.field_qualifiers.extend(qualifiers);
     }

-    /// Get a list of fields
+    /// Get a list of fields for this schema
     pub fn fields(&self) -> &Fields {
         &self.inner.fields
     }

-    /// Returns an immutable reference of a specific `Field` instance selected using an
-    /// offset within the internal `fields` vector
-    pub fn field(&self, i: usize) -> &Field {
+    /// Returns a reference to [`FieldRef`] for a column at a specific index
+    /// within the schema.
+    ///
+    /// See also [Self::qualified_field] to get both qualifier and field
+    pub fn field(&self, i: usize) -> &FieldRef {
         &self.inner.fields[i]
     }

-    /// Returns an immutable reference of a specific `Field` instance selected using an
-    /// offset within the internal `fields` vector and its qualifier
-    pub fn qualified_field(&self, i: usize) -> (Option<&TableReference>, &Field) {
+    /// Returns the qualifier (if any) and [`FieldRef`] for a column at a specific
+    /// index within the schema.
+    pub fn qualified_field(&self, i: usize) -> (Option<&TableReference>, &FieldRef) {
         (self.field_qualifiers[i].as_ref(), self.field(i))
     }

@@ -410,12 +425,12 @@ impl DFSchema {
             .is_some()
     }

-    /// Find the field with the given name
+    /// Find the [`FieldRef`] with the given name and optional qualifier
     pub fn field_with_name(
         &self,
         qualifier: Option<&TableReference>,
         name: &str,
-    ) -> Result<&Field> {
+    ) -> Result<&FieldRef> {
         if let Some(qualifier) = qualifier {
             self.field_with_qualified_name(qualifier, name)
         } else {
@@ -428,7 +443,7 @@ impl DFSchema {
         &self,
         qualifier: Option<&TableReference>,
         name: &str,
-    ) -> Result<(Option<&TableReference>, &Field)> {
+    ) -> Result<(Option<&TableReference>, &FieldRef)> {
         if let Some(qualifier) = qualifier {
             let idx = self
                 .index_of_column_by_name(Some(qualifier), name)
@@ -440,10 +455,10 @@ impl DFSchema {
     }

     /// Find all fields having the given qualifier
-    pub fn fields_with_qualified(&self, qualifier: &TableReference) -> Vec<&Field> {
+    pub fn fields_with_qualified(&self, qualifier: &TableReference) -> Vec<&FieldRef> {
         self.iter()
             .filter(|(q, _)| q.map(|q| q.eq(qualifier)).unwrap_or(false))
-            .map(|(_, f)| f.as_ref())
+            .map(|(_, f)| f)
             .collect()
     }

@@ -459,11 +474,10 @@ impl DFSchema {
     }

     /// Find all fields that match the given name
-    pub fn fields_with_unqualified_name(&self, name: &str) -> Vec<&Field> {
+    pub fn fields_with_unqualified_name(&self, name: &str) -> Vec<&FieldRef> {
         self.fields()
             .iter()
             .filter(|field| field.name() == name)
-            .map(|f| f.as_ref())
             .collect()
     }

     /// Find all fields that match the given name
     pub fn qualified_fields_with_unqualified_name(
         &self,
         name: &str,
-    ) -> Vec<(Option<&TableReference>, &Field)> {
+    ) -> Vec<(Option<&TableReference>, &FieldRef)> {
         self.iter()
             .filter(|(_, field)| field.name() == name)
-            .map(|(qualifier, field)| 
(qualifier, field.as_ref())) .collect() } @@ -499,7 +512,7 @@ impl DFSchema { pub fn qualified_field_with_unqualified_name( &self, name: &str, - ) -> Result<(Option<&TableReference>, &Field)> { + ) -> Result<(Option<&TableReference>, &FieldRef)> { let matches = self.qualified_fields_with_unqualified_name(name); match matches.len() { 0 => Err(unqualified_field_not_found(name, self)), @@ -528,7 +541,7 @@ impl DFSchema { } /// Find the field with the given name - pub fn field_with_unqualified_name(&self, name: &str) -> Result<&Field> { + pub fn field_with_unqualified_name(&self, name: &str) -> Result<&FieldRef> { self.qualified_field_with_unqualified_name(name) .map(|(_, field)| field) } @@ -538,7 +551,7 @@ impl DFSchema { &self, qualifier: &TableReference, name: &str, - ) -> Result<&Field> { + ) -> Result<&FieldRef> { let idx = self .index_of_column_by_name(Some(qualifier), name) .ok_or_else(|| field_not_found(Some(qualifier.clone()), name, self))?; @@ -550,7 +563,7 @@ impl DFSchema { pub fn qualified_field_from_column( &self, column: &Column, - ) -> Result<(Option<&TableReference>, &Field)> { + ) -> Result<(Option<&TableReference>, &FieldRef)> { self.qualified_field_with_name(column.relation.as_ref(), &column.name) } @@ -692,10 +705,12 @@ impl DFSchema { // check nested fields match (dt1, dt2) { (DataType::Dictionary(_, v1), DataType::Dictionary(_, v2)) => { - v1.as_ref() == v2.as_ref() + Self::datatype_is_logically_equal(v1.as_ref(), v2.as_ref()) + } + (DataType::Dictionary(_, v1), othertype) + | (othertype, DataType::Dictionary(_, v1)) => { + Self::datatype_is_logically_equal(v1.as_ref(), othertype) } - (DataType::Dictionary(_, v1), othertype) => v1.as_ref() == othertype, - (othertype, DataType::Dictionary(_, v1)) => v1.as_ref() == othertype, (DataType::List(f1), DataType::List(f2)) | (DataType::LargeList(f1), DataType::LargeList(f2)) | (DataType::FixedSizeList(f1, _), DataType::FixedSizeList(f2, _)) => { @@ -982,36 +997,35 @@ fn format_field_with_indent( result.push_str(&format!( "{indent}|-- {field_name}: map (nullable = {nullable_str})\n" )); - if let DataType::Struct(inner_fields) = field.data_type() { - if inner_fields.len() == 2 { - format_field_with_indent( - result, - "key", - inner_fields[0].data_type(), - inner_fields[0].is_nullable(), - &child_indent, - ); - let value_contains_null = - field.is_nullable().to_string().to_lowercase(); - // Handle complex value types properly - match inner_fields[1].data_type() { - DataType::Struct(_) - | DataType::List(_) - | DataType::LargeList(_) - | DataType::FixedSizeList(_, _) - | DataType::Map(_, _) => { - format_field_with_indent( - result, - "value", - inner_fields[1].data_type(), - inner_fields[1].is_nullable(), - &child_indent, - ); - } - _ => { - result.push_str(&format!("{child_indent}|-- value: {} (nullable = {value_contains_null})\n", + if let DataType::Struct(inner_fields) = field.data_type() + && inner_fields.len() == 2 + { + format_field_with_indent( + result, + "key", + inner_fields[0].data_type(), + inner_fields[0].is_nullable(), + &child_indent, + ); + let value_contains_null = field.is_nullable().to_string().to_lowercase(); + // Handle complex value types properly + match inner_fields[1].data_type() { + DataType::Struct(_) + | DataType::List(_) + | DataType::LargeList(_) + | DataType::FixedSizeList(_, _) + | DataType::Map(_, _) => { + format_field_with_indent( + result, + "value", + inner_fields[1].data_type(), + inner_fields[1].is_nullable(), + &child_indent, + ); + } + _ => { + result.push_str(&format!("{child_indent}|-- 
value: {} (nullable = {value_contains_null})\n", format_simple_data_type(inner_fields[1].data_type()))); - } } } } @@ -1129,6 +1143,12 @@ impl TryFrom for DFSchema { } } +impl From for SchemaRef { + fn from(dfschema: DFSchema) -> Self { + Arc::clone(&dfschema.inner) + } +} + // Hashing refers to a subset of fields considered in PartialEq. impl Hash for DFSchema { fn hash(&self, state: &mut H) { @@ -1221,7 +1241,7 @@ pub trait ExprSchema: std::fmt::Debug { } // Return the column's field - fn field_from_column(&self, col: &Column) -> Result<&Field>; + fn field_from_column(&self, col: &Column) -> Result<&FieldRef>; } // Implement `ExprSchema` for `Arc` @@ -1242,13 +1262,13 @@ impl + std::fmt::Debug> ExprSchema for P { self.as_ref().data_type_and_nullable(col) } - fn field_from_column(&self, col: &Column) -> Result<&Field> { + fn field_from_column(&self, col: &Column) -> Result<&FieldRef> { self.as_ref().field_from_column(col) } } impl ExprSchema for DFSchema { - fn field_from_column(&self, col: &Column) -> Result<&Field> { + fn field_from_column(&self, col: &Column) -> Result<&FieldRef> { match &col.relation { Some(r) => self.field_with_qualified_name(r, &col.name), None => self.field_with_unqualified_name(&col.name), @@ -1325,11 +1345,44 @@ impl SchemaExt for Schema { } } +/// Build a fully-qualified field name string. This is equivalent to +/// `format!("{q}.{name}")` when `qualifier` is `Some`, or just `name` when +/// `None`. We avoid going through the `fmt` machinery for performance reasons. pub fn qualified_name(qualifier: Option<&TableReference>, name: &str) -> String { - match qualifier { - Some(q) => format!("{q}.{name}"), - None => name.to_string(), - } + let qualifier = match qualifier { + None => return name.to_string(), + Some(q) => q, + }; + let (first, second, third) = match qualifier { + TableReference::Bare { table } => (table.as_ref(), None, None), + TableReference::Partial { schema, table } => { + (schema.as_ref(), Some(table.as_ref()), None) + } + TableReference::Full { + catalog, + schema, + table, + } => ( + catalog.as_ref(), + Some(schema.as_ref()), + Some(table.as_ref()), + ), + }; + + let extra = second.map_or(0, str::len) + third.map_or(0, str::len); + let mut s = String::with_capacity(first.len() + extra + 3 + name.len()); + s.push_str(first); + if let Some(second) = second { + s.push('.'); + s.push_str(second); + } + if let Some(third) = third { + s.push('.'); + s.push_str(third); + } + s.push('.'); + s.push_str(name); + s } #[cfg(test)] @@ -1338,6 +1391,36 @@ mod tests { use super::*; + /// `qualified_name` doesn't use `TableReference::Display` for performance + /// reasons, but check that the output is consistent. + #[test] + fn qualified_name_agrees_with_display() { + let cases: &[(Option, &str)] = &[ + (None, "col"), + (Some(TableReference::bare("t")), "c0"), + (Some(TableReference::partial("s", "t")), "c0"), + (Some(TableReference::full("c", "s", "t")), "c0"), + (Some(TableReference::bare("mytable")), "some_column_name"), + // Empty segments must be preserved so that distinct qualified + // fields don't collide in `DFSchema::field_names()`. 
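+            // (Illustration: `bare("")` renders as ".col", which must stay
+            // distinct from the unqualified name "col".)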
+ (Some(TableReference::bare("")), "col"), + (Some(TableReference::partial("s", "")), "col"), + (Some(TableReference::partial("", "t")), "col"), + (Some(TableReference::full("c", "", "t")), "col"), + (Some(TableReference::full("", "s", "t")), "col"), + (Some(TableReference::full("c", "s", "")), "col"), + (Some(TableReference::full("", "", "")), "col"), + ]; + for (qualifier, name) in cases { + let actual = qualified_name(qualifier.as_ref(), name); + let expected = match qualifier { + Some(q) => format!("{q}.{name}"), + None => name.to_string(), + }; + assert_eq!(actual, expected, "qualifier={qualifier:?} name={name}"); + } + } + #[test] fn qualifier_in_name() -> Result<()> { let col = Column::from_name("t1.c0"); @@ -1433,12 +1516,14 @@ mod tests { join.to_string() ); // test valid access - assert!(join - .field_with_qualified_name(&TableReference::bare("t1"), "c0") - .is_ok()); - assert!(join - .field_with_qualified_name(&TableReference::bare("t2"), "c0") - .is_ok()); + assert!( + join.field_with_qualified_name(&TableReference::bare("t1"), "c0") + .is_ok() + ); + assert!( + join.field_with_qualified_name(&TableReference::bare("t2"), "c0") + .is_ok() + ); // test invalid access assert!(join.field_with_unqualified_name("c0").is_err()); assert!(join.field_with_unqualified_name("t1.c0").is_err()); @@ -1480,18 +1565,20 @@ mod tests { join.to_string() ); // test valid access - assert!(join - .field_with_qualified_name(&TableReference::bare("t1"), "c0") - .is_ok()); + assert!( + join.field_with_qualified_name(&TableReference::bare("t1"), "c0") + .is_ok() + ); assert!(join.field_with_unqualified_name("c0").is_ok()); assert!(join.field_with_unqualified_name("c100").is_ok()); assert!(join.field_with_name(None, "c100").is_ok()); // test invalid access assert!(join.field_with_unqualified_name("t1.c0").is_err()); assert!(join.field_with_unqualified_name("t1.c100").is_err()); - assert!(join - .field_with_qualified_name(&TableReference::bare(""), "c100") - .is_err()); + assert!( + join.field_with_qualified_name(&TableReference::bare(""), "c100") + .is_err() + ); Ok(()) } @@ -1500,9 +1587,11 @@ mod tests { let left = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; let right = DFSchema::try_from(test_schema_1())?; let join = left.join(&right); - assert_contains!(join.unwrap_err().to_string(), - "Schema error: Schema contains qualified \ - field name t1.c0 and unqualified field name c0 which would be ambiguous"); + assert_contains!( + join.unwrap_err().to_string(), + "Schema error: Schema contains qualified \ + field name t1.c0 and unqualified field name c0 which would be ambiguous" + ); Ok(()) } @@ -1781,6 +1870,27 @@ mod tests { &DataType::Utf8, &DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)) )); + + // Dictionary is logically equal to the logically equivalent value type + assert!(DFSchema::datatype_is_logically_equal( + &DataType::Utf8View, + &DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)) + )); + + assert!(DFSchema::datatype_is_logically_equal( + &DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::List( + Field::new("element", DataType::Utf8, false).into() + )) + ), + &DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::List( + Field::new("element", DataType::Utf8View, false).into() + )) + ) + )); } #[test] @@ -2059,7 +2169,7 @@ mod tests { fn test_print_schema_empty() { let schema = DFSchema::empty(); let output = schema.tree_string(); - insta::assert_snapshot!(output, @r###"root"###); + 
insta::assert_snapshot!(output, @"root"); } #[test] diff --git a/datafusion/common/src/display/human_readable.rs b/datafusion/common/src/display/human_readable.rs new file mode 100644 index 0000000000000..0e0d677bd8904 --- /dev/null +++ b/datafusion/common/src/display/human_readable.rs @@ -0,0 +1,139 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Helpers for rendering sizes, counts, and durations in human readable form. + +/// Common data size units +pub mod units { + pub const TB: u64 = 1 << 40; + pub const GB: u64 = 1 << 30; + pub const MB: u64 = 1 << 20; + pub const KB: u64 = 1 << 10; +} + +/// Present size in human-readable form +pub fn human_readable_size(size: usize) -> String { + use units::*; + + let size = size as u64; + let (value, unit) = { + if size >= 2 * TB { + (size as f64 / TB as f64, "TB") + } else if size >= 2 * GB { + (size as f64 / GB as f64, "GB") + } else if size >= 2 * MB { + (size as f64 / MB as f64, "MB") + } else if size >= 2 * KB { + (size as f64 / KB as f64, "KB") + } else { + (size as f64, "B") + } + }; + format!("{value:.1} {unit}") +} + +/// Present count in human-readable form with K, M, B, T suffixes +pub fn human_readable_count(count: usize) -> String { + let count = count as u64; + let (value, unit) = { + if count >= 1_000_000_000_000 { + (count as f64 / 1_000_000_000_000.0, " T") + } else if count >= 1_000_000_000 { + (count as f64 / 1_000_000_000.0, " B") + } else if count >= 1_000_000 { + (count as f64 / 1_000_000.0, " M") + } else if count >= 1_000 { + (count as f64 / 1_000.0, " K") + } else { + return count.to_string(); + } + }; + + // Format with appropriate precision + // For values >= 100, show 1 decimal place (e.g., 123.4 K) + // For values < 100, show 2 decimal places (e.g., 10.12 K) + if value >= 100.0 { + format!("{value:.1}{unit}") + } else { + format!("{value:.2}{unit}") + } +} + +/// Present duration in human-readable form with 2 decimal places +pub fn human_readable_duration(nanos: u64) -> String { + const NANOS_PER_SEC: f64 = 1_000_000_000.0; + const NANOS_PER_MILLI: f64 = 1_000_000.0; + const NANOS_PER_MICRO: f64 = 1_000.0; + + let nanos_f64 = nanos as f64; + + if nanos >= 1_000_000_000 { + // >= 1 second: show in seconds + format!("{:.2}s", nanos_f64 / NANOS_PER_SEC) + } else if nanos >= 1_000_000 { + // >= 1 millisecond: show in milliseconds + format!("{:.2}ms", nanos_f64 / NANOS_PER_MILLI) + } else if nanos >= 1_000 { + // >= 1 microsecond: show in microseconds + format!("{:.2}µs", nanos_f64 / NANOS_PER_MICRO) + } else { + // < 1 microsecond: show in nanoseconds + format!("{nanos}ns") + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_human_readable_count() { + assert_eq!(human_readable_count(0), "0"); + assert_eq!(human_readable_count(1), "1"); + 
assert_eq!(human_readable_count(999), "999"); + assert_eq!(human_readable_count(1_000), "1.00 K"); + assert_eq!(human_readable_count(10_100), "10.10 K"); + assert_eq!(human_readable_count(1_532), "1.53 K"); + assert_eq!(human_readable_count(99_999), "100.00 K"); + assert_eq!(human_readable_count(1_000_000), "1.00 M"); + assert_eq!(human_readable_count(1_532_000), "1.53 M"); + assert_eq!(human_readable_count(99_000_000), "99.00 M"); + assert_eq!(human_readable_count(123_456_789), "123.5 M"); + assert_eq!(human_readable_count(1_000_000_000), "1.00 B"); + assert_eq!(human_readable_count(1_532_000_000), "1.53 B"); + assert_eq!(human_readable_count(999_999_999_999), "1000.0 B"); + assert_eq!(human_readable_count(1_000_000_000_000), "1.00 T"); + assert_eq!(human_readable_count(42_000_000_000_000), "42.00 T"); + } + + #[test] + fn test_human_readable_duration() { + assert_eq!(human_readable_duration(0), "0ns"); + assert_eq!(human_readable_duration(1), "1ns"); + assert_eq!(human_readable_duration(999), "999ns"); + assert_eq!(human_readable_duration(1_000), "1.00µs"); + assert_eq!(human_readable_duration(1_234), "1.23µs"); + assert_eq!(human_readable_duration(999_999), "1000.00µs"); + assert_eq!(human_readable_duration(1_000_000), "1.00ms"); + assert_eq!(human_readable_duration(11_295_377), "11.30ms"); + assert_eq!(human_readable_duration(1_234_567), "1.23ms"); + assert_eq!(human_readable_duration(999_999_999), "1000.00ms"); + assert_eq!(human_readable_duration(1_000_000_000), "1.00s"); + assert_eq!(human_readable_duration(1_234_567_890), "1.23s"); + assert_eq!(human_readable_duration(42_000_000_000), "42.00s"); + } +} diff --git a/datafusion/common/src/display/mod.rs b/datafusion/common/src/display/mod.rs index bad51c45f8ee8..a6a97b243f06a 100644 --- a/datafusion/common/src/display/mod.rs +++ b/datafusion/common/src/display/mod.rs @@ -18,6 +18,7 @@ //! Types for plan display mod graphviz; +pub mod human_readable; pub use graphviz::*; use std::{ diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index fde52944d0497..c6c50371c26c1 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -15,7 +15,25 @@ // specific language governing permissions and limitations // under the License. -//! DataFusion error types +//! # Error Handling in DataFusion +//! +//! In DataFusion, there are two types of errors that can be raised: +//! +//! 1. Expected errors – These indicate invalid operations performed by the caller, +//! such as attempting to open a non-existent file. Different categories exist to +//! distinguish their sources (e.g., [`DataFusionError::ArrowError`], +//! [`DataFusionError::IoError`], etc.). +//! +//! 2. Unexpected errors – Represented by [`DataFusionError::Internal`], these +//! indicate that an internal invariant has been broken, suggesting a potential +//! bug in the system. +//! +//! There are several convenient macros for throwing errors. For example, use +//! `exec_err!` for expected errors. +//! For invariant checks, you can use `assert_or_internal_err!`, +//! `assert_eq_or_internal_err!`, `assert_ne_or_internal_err!` for easier assertions. +//! On the performance-critical path, use `debug_assert!` instead to reduce overhead. 
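+//!
+//! As an illustrative sketch (the function below is hypothetical, not part of
+//! this crate), the two kinds of errors are typically raised like this:
+//!
+//! ```text
+//! fn lookup(values: &[i64], i: usize) -> Result<i64> {
+//!     // Expected error: the caller asked for an invalid position
+//!     if i >= values.len() {
+//!         return exec_err!("index {i} out of bounds for {} values", values.len());
+//!     }
+//!     // Unexpected error: guard an internal invariant
+//!     assert_or_internal_err!(!values.is_empty(), "values must be non-empty here");
+//!     Ok(values[i])
+//! }
+//! ```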
+
 #[cfg(feature = "backtrace")]
 use std::backtrace::{Backtrace, BacktraceStatus};
@@ -30,8 +48,6 @@
 use std::sync::Arc;
 
 use crate::utils::datafusion_strsim::normalized_levenshtein;
 use crate::utils::quote_identifier;
 use crate::{Column, DFSchema, Diagnostic, TableReference};
-#[cfg(feature = "avro")]
-use apache_avro::Error as AvroError;
 use arrow::error::ArrowError;
 #[cfg(feature = "parquet")]
 use parquet::errors::ParquetError;
@@ -58,9 +74,6 @@ pub enum DataFusionError {
     /// Error when reading / writing Parquet data.
     #[cfg(feature = "parquet")]
     ParquetError(Box<ParquetError>),
-    /// Error when reading Avro data.
-    #[cfg(feature = "avro")]
-    AvroError(Box<AvroError>),
     /// Error when reading / writing to / from an object_store (e.g. S3 or LocalFile)
     #[cfg(feature = "object_store")]
     ObjectStore(Box<object_store::Error>),
@@ -153,6 +166,10 @@ pub enum DataFusionError {
     /// to multiple receivers. For example, when the source of a repartition
     /// errors and the error is propagated to multiple consumers.
     Shared(Arc<DataFusionError>),
+    /// An error that originated during a foreign function interface call.
+    /// Transferring errors across the FFI boundary is difficult, so the original
+    /// error will be converted to a string.
+    Ffi(String),
 }
 
 #[macro_export]
@@ -310,13 +327,6 @@ impl From<ParquetError> for DataFusionError {
     }
 }
 
-#[cfg(feature = "avro")]
-impl From<AvroError> for DataFusionError {
-    fn from(e: AvroError) -> Self {
-        DataFusionError::AvroError(Box::new(e))
-    }
-}
-
 #[cfg(feature = "object_store")]
 impl From<object_store::Error> for DataFusionError {
     fn from(e: object_store::Error) -> Self {
@@ -367,8 +377,6 @@ impl Error for DataFusionError {
             DataFusionError::ArrowError(e, _) => Some(e.as_ref()),
             #[cfg(feature = "parquet")]
             DataFusionError::ParquetError(e) => Some(e.as_ref()),
-            #[cfg(feature = "avro")]
-            DataFusionError::AvroError(e) => Some(e.as_ref()),
             #[cfg(feature = "object_store")]
             DataFusionError::ObjectStore(e) => Some(e.as_ref()),
             DataFusionError::IoError(e) => Some(e),
@@ -395,6 +403,7 @@ impl Error for DataFusionError {
             // can't be executed.
             DataFusionError::Collection(errs) => errs.first().map(|e| e as &dyn Error),
             DataFusionError::Shared(e) => Some(e.as_ref()),
+            DataFusionError::Ffi(_) => None,
         }
     }
 }
@@ -497,8 +506,6 @@ impl DataFusionError {
             DataFusionError::ArrowError(_, _) => "Arrow error: ",
             #[cfg(feature = "parquet")]
             DataFusionError::ParquetError(_) => "Parquet error: ",
-            #[cfg(feature = "avro")]
-            DataFusionError::AvroError(_) => "Avro error: ",
             #[cfg(feature = "object_store")]
             DataFusionError::ObjectStore(_) => "Object Store error: ",
             DataFusionError::IoError(_) => "IO error: ",
@@ -526,6 +533,7 @@ impl DataFusionError {
                 errs.first().expect("cannot construct DataFusionError::Collection with 0 errors, but got one such case").error_prefix()
             }
             DataFusionError::Shared(_) => "",
+            DataFusionError::Ffi(_) => "FFI error: ",
         }
     }
 
@@ -537,8 +545,6 @@ impl DataFusionError {
             }
             #[cfg(feature = "parquet")]
             DataFusionError::ParquetError(ref desc) => Cow::Owned(desc.to_string()),
-            #[cfg(feature = "avro")]
-            DataFusionError::AvroError(ref desc) => Cow::Owned(desc.to_string()),
             DataFusionError::IoError(ref desc) => Cow::Owned(desc.to_string()),
             #[cfg(feature = "sql")]
             DataFusionError::SQL(ref desc, ref backtrace) => {
@@ -578,6 +584,7 @@ impl DataFusionError {
                 .expect("cannot construct DataFusionError::Collection with 0 errors")
                 .message(),
             DataFusionError::Shared(ref desc) => Cow::Owned(desc.to_string()),
+            DataFusionError::Ffi(ref desc) => Cow::Owned(desc.to_string()),
         }
     }
 
@@ -750,7 +757,7 @@ impl DataFusionErrorBuilder {
 macro_rules! 
unwrap_or_internal_err { ($Value: ident) => { $Value.ok_or_else(|| { - DataFusionError::Internal(format!( + $crate::DataFusionError::Internal(format!( "{} should not be None", stringify!($Value) )) @@ -758,6 +765,116 @@ macro_rules! unwrap_or_internal_err { }; } +/// Assert a condition, returning `DataFusionError::Internal` on failure. +/// +/// # Examples +/// +/// ```text +/// assert_or_internal_err!(predicate); +/// assert_or_internal_err!(predicate, "human readable message"); +/// assert_or_internal_err!(predicate, format!("details: {}", value)); +/// ``` +#[macro_export] +macro_rules! assert_or_internal_err { + ($cond:expr) => { + if !$cond { + return Err($crate::DataFusionError::Internal(format!( + "Assertion failed: {}", + stringify!($cond) + ))); + } + }; + ($cond:expr, $($arg:tt)+) => { + if !$cond { + return Err($crate::DataFusionError::Internal(format!( + "Assertion failed: {}: {}", + stringify!($cond), + format!($($arg)+) + ))); + } + }; +} + +/// Assert equality, returning `DataFusionError::Internal` on failure. +/// +/// # Examples +/// +/// ```text +/// assert_eq_or_internal_err!(actual, expected); +/// assert_eq_or_internal_err!(left_expr, right_expr, "values must match"); +/// assert_eq_or_internal_err!(lhs, rhs, "metadata: {}", extra); +/// ``` +#[macro_export] +macro_rules! assert_eq_or_internal_err { + ($left:expr, $right:expr $(,)?) => {{ + let left_val = &$left; + let right_val = &$right; + if left_val != right_val { + return Err($crate::DataFusionError::Internal(format!( + "Assertion failed: {} == {} (left: {:?}, right: {:?})", + stringify!($left), + stringify!($right), + left_val, + right_val + ))); + } + }}; + ($left:expr, $right:expr, $($arg:tt)+) => {{ + let left_val = &$left; + let right_val = &$right; + if left_val != right_val { + return Err($crate::DataFusionError::Internal(format!( + "Assertion failed: {} == {} (left: {:?}, right: {:?}): {}", + stringify!($left), + stringify!($right), + left_val, + right_val, + format!($($arg)+) + ))); + } + }}; +} + +/// Assert inequality, returning `DataFusionError::Internal` on failure. +/// +/// # Examples +/// +/// ```text +/// assert_ne_or_internal_err!(left, right); +/// assert_ne_or_internal_err!(lhs_expr, rhs_expr, "values must differ"); +/// assert_ne_or_internal_err!(a, b, "context {}", info); +/// ``` +#[macro_export] +macro_rules! assert_ne_or_internal_err { + ($left:expr, $right:expr $(,)?) => {{ + let left_val = &$left; + let right_val = &$right; + if left_val == right_val { + return Err($crate::DataFusionError::Internal(format!( + "Assertion failed: {} != {} (left: {:?}, right: {:?})", + stringify!($left), + stringify!($right), + left_val, + right_val + ))); + } + }}; + ($left:expr, $right:expr, $($arg:tt)+) => {{ + let left_val = &$left; + let right_val = &$right; + if left_val == right_val { + return Err($crate::DataFusionError::Internal(format!( + "Assertion failed: {} != {} (left: {:?}, right: {:?}): {}", + stringify!($left), + stringify!($right), + left_val, + right_val, + format!($($arg)+) + ))); + } + }}; +} + /// Add a macros for concise DataFusionError::* errors declaration /// supports placeholders the same way as `format!` /// Examples: @@ -768,84 +885,131 @@ macro_rules! 
unwrap_or_internal_err { /// plan_err!("Error {val:?}") /// /// `NAME_ERR` - macro name for wrapping Err(DataFusionError::*) +/// `PREFIXED_NAME_ERR` - underscore-prefixed alias for NAME_ERR (e.g., _plan_err) +/// (Needed to avoid compiler error when using macro in the same crate: `macros from the current crate cannot be referred to by absolute paths`) /// `NAME_DF_ERR` - macro name for wrapping DataFusionError::*. Needed to keep backtrace opportunity /// in construction where DataFusionError::* used directly, like `map_err`, `ok_or_else`, etc +/// `PREFIXED_NAME_DF_ERR` - underscore-prefixed alias for NAME_DF_ERR (e.g., _plan_datafusion_err). +/// (Needed to avoid compiler error when using macro in the same crate: `macros from the current crate cannot be referred to by absolute paths`) macro_rules! make_error { - ($NAME_ERR:ident, $NAME_DF_ERR: ident, $ERR:ident) => { make_error!(@inner ($), $NAME_ERR, $NAME_DF_ERR, $ERR); }; - (@inner ($d:tt), $NAME_ERR:ident, $NAME_DF_ERR:ident, $ERR:ident) => { - ::paste::paste!{ - /// Macro wraps `$ERR` to add backtrace feature - #[macro_export] - macro_rules! $NAME_DF_ERR { - ($d($d args:expr),* $d(; diagnostic=$d DIAG:expr)?) => {{ - let err =$crate::DataFusionError::$ERR( - ::std::format!( - "{}{}", - ::std::format!($d($d args),*), - $crate::DataFusionError::get_back_trace(), - ).into() - ); - $d ( - let err = err.with_diagnostic($d DIAG); - )? - err - } - } + ($NAME_ERR:ident, $PREFIXED_NAME_ERR:ident, $NAME_DF_ERR:ident, $PREFIXED_NAME_DF_ERR:ident, $ERR:ident) => { + make_error!(@inner ($), $NAME_ERR, $PREFIXED_NAME_ERR, $NAME_DF_ERR, $PREFIXED_NAME_DF_ERR, $ERR); + }; + (@inner ($d:tt), $NAME_ERR:ident, $PREFIXED_NAME_ERR:ident, $NAME_DF_ERR:ident, $PREFIXED_NAME_DF_ERR:ident, $ERR:ident) => { + /// Macro wraps `$ERR` to add backtrace feature + #[macro_export] + macro_rules! $NAME_DF_ERR { + ($d($d args:expr),* $d(; diagnostic = $d DIAG:expr)?) => {{ + let err = $crate::DataFusionError::$ERR( + ::std::format!( + "{}{}", + ::std::format!($d($d args),*), + $crate::DataFusionError::get_back_trace(), + ).into() + ); + $d ( + let err = err.with_diagnostic($d DIAG); + )? + err + }} } - /// Macro wraps Err(`$ERR`) to add backtrace feature - #[macro_export] - macro_rules! $NAME_ERR { - ($d($d args:expr),* $d(; diagnostic = $d DIAG:expr)?) => {{ - let err = $crate::[<_ $NAME_DF_ERR>]!($d($d args),*); - $d ( - let err = err.with_diagnostic($d DIAG); - )? - Err(err) - - }} - } - - - // Note: Certain macros are used in this crate, but not all. - // This macro generates a use or all of them in case they are needed - // so we allow unused code to avoid warnings when they are not used - #[doc(hidden)] - #[allow(unused)] - pub use $NAME_ERR as [<_ $NAME_ERR>]; - #[doc(hidden)] - #[allow(unused)] - pub use $NAME_DF_ERR as [<_ $NAME_DF_ERR>]; + /// Macro wraps Err(`$ERR`) to add backtrace feature + #[macro_export] + macro_rules! $NAME_ERR { + ($d($d args:expr),* $d(; diagnostic = $d DIAG:expr)?) => {{ + let err = $crate::$PREFIXED_NAME_DF_ERR!($d($d args),*); + $d ( + let err = err.with_diagnostic($d DIAG); + )? 
+ Err(err) + }} } + + #[doc(hidden)] + pub use $NAME_ERR as $PREFIXED_NAME_ERR; + #[doc(hidden)] + pub use $NAME_DF_ERR as $PREFIXED_NAME_DF_ERR; }; } // Exposes a macro to create `DataFusionError::Plan` with optional backtrace -make_error!(plan_err, plan_datafusion_err, Plan); +make_error!( + plan_err, + _plan_err, + plan_datafusion_err, + _plan_datafusion_err, + Plan +); // Exposes a macro to create `DataFusionError::Internal` with optional backtrace -make_error!(internal_err, internal_datafusion_err, Internal); +make_error!( + internal_err, + _internal_err, + internal_datafusion_err, + _internal_datafusion_err, + Internal +); // Exposes a macro to create `DataFusionError::NotImplemented` with optional backtrace -make_error!(not_impl_err, not_impl_datafusion_err, NotImplemented); +make_error!( + not_impl_err, + _not_impl_err, + not_impl_datafusion_err, + _not_impl_datafusion_err, + NotImplemented +); // Exposes a macro to create `DataFusionError::Execution` with optional backtrace -make_error!(exec_err, exec_datafusion_err, Execution); +make_error!( + exec_err, + _exec_err, + exec_datafusion_err, + _exec_datafusion_err, + Execution +); // Exposes a macro to create `DataFusionError::Configuration` with optional backtrace -make_error!(config_err, config_datafusion_err, Configuration); +make_error!( + config_err, + _config_err, + config_datafusion_err, + _config_datafusion_err, + Configuration +); // Exposes a macro to create `DataFusionError::Substrait` with optional backtrace -make_error!(substrait_err, substrait_datafusion_err, Substrait); +make_error!( + substrait_err, + _substrait_err, + substrait_datafusion_err, + _substrait_datafusion_err, + Substrait +); // Exposes a macro to create `DataFusionError::ResourcesExhausted` with optional backtrace -make_error!(resources_err, resources_datafusion_err, ResourcesExhausted); +make_error!( + resources_err, + _resources_err, + resources_datafusion_err, + _resources_datafusion_err, + ResourcesExhausted +); + +// Exposes a macro to create `DataFusionError::Ffi` with optional backtrace +make_error!( + ffi_err, + _ffi_err, + ffi_datafusion_err, + _ffi_datafusion_err, + Ffi +); // Exposes a macro to create `DataFusionError::SQL` with optional backtrace #[macro_export] macro_rules! sql_datafusion_err { ($ERR:expr $(; diagnostic = $DIAG:expr)?) => {{ - let err = DataFusionError::SQL(Box::new($ERR), Some(DataFusionError::get_back_trace())); + let err = $crate::DataFusionError::SQL(Box::new($ERR), Some($crate::DataFusionError::get_back_trace())); $( let err = err.with_diagnostic($DIAG); )? @@ -857,7 +1021,7 @@ macro_rules! sql_datafusion_err { #[macro_export] macro_rules! sql_err { ($ERR:expr $(; diagnostic = $DIAG:expr)?) => {{ - let err = datafusion_common::sql_datafusion_err!($ERR); + let err = $crate::sql_datafusion_err!($ERR); $( let err = err.with_diagnostic($DIAG); )? @@ -869,7 +1033,7 @@ macro_rules! sql_err { #[macro_export] macro_rules! arrow_datafusion_err { ($ERR:expr $(; diagnostic = $DIAG:expr)?) => {{ - let err = DataFusionError::ArrowError(Box::new($ERR), Some(DataFusionError::get_back_trace())); + let err = $crate::DataFusionError::ArrowError(Box::new($ERR), Some($crate::DataFusionError::get_back_trace())); $( let err = err.with_diagnostic($DIAG); )? @@ -882,7 +1046,7 @@ macro_rules! arrow_datafusion_err { macro_rules! arrow_err { ($ERR:expr $(; diagnostic = $DIAG:expr)?) => { { - let err = datafusion_common::arrow_datafusion_err!($ERR); + let err = $crate::arrow_datafusion_err!($ERR); $( let err = err.with_diagnostic($DIAG); )? 
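To make the `$crate::` rewrites above concrete: a minimal sketch (the helper function is illustrative, not part of this patch) of invoking one of these macros from a downstream crate. The expansion now resolves through `$crate::`, so it no longer assumes `datafusion_common` is in scope under that exact name:

```rust
use datafusion_common::{plan_err, Result};

// Hypothetical validation helper: `plan_err!` expands to
// `$crate::_plan_datafusion_err!(...)`, so the same invocation works
// both inside datafusion_common and from external crates.
fn require_table_name(name: &str) -> Result<()> {
    if name.is_empty() {
        return plan_err!("table name must not be empty");
    }
    Ok(())
}
```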
@@ -894,9 +1058,9 @@ macro_rules! arrow_err { #[macro_export] macro_rules! schema_datafusion_err { ($ERR:expr $(; diagnostic = $DIAG:expr)?) => {{ - let err = $crate::error::DataFusionError::SchemaError( + let err = $crate::DataFusionError::SchemaError( Box::new($ERR), - Box::new(Some($crate::error::DataFusionError::get_back_trace())), + Box::new(Some($crate::DataFusionError::get_back_trace())), ); $( let err = err.with_diagnostic($DIAG); @@ -909,9 +1073,9 @@ macro_rules! schema_datafusion_err { #[macro_export] macro_rules! schema_err { ($ERR:expr $(; diagnostic = $DIAG:expr)?) => {{ - let err = $crate::error::DataFusionError::SchemaError( + let err = $crate::DataFusionError::SchemaError( Box::new($ERR), - Box::new(Some($crate::error::DataFusionError::get_back_trace())), + Box::new(Some($crate::DataFusionError::get_back_trace())), ); $( let err = err.with_diagnostic($DIAG); @@ -974,6 +1138,115 @@ mod test { use std::sync::Arc; use arrow::error::ArrowError; + use insta::assert_snapshot; + + fn ok_result() -> Result<()> { + Ok(()) + } + + #[test] + fn test_assert_eq_or_internal_err_passes() -> Result<()> { + assert_eq_or_internal_err!(1, 1); + ok_result() + } + + #[test] + fn test_assert_eq_or_internal_err_fails() { + fn check() -> Result<()> { + assert_eq_or_internal_err!(1, 2, "expected equality"); + ok_result() + } + + let err = check().unwrap_err(); + assert_snapshot!( + err.to_string(), + @r" + Internal error: Assertion failed: 1 == 2 (left: 1, right: 2): expected equality. + This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues + " + ); + } + + #[test] + fn test_assert_ne_or_internal_err_passes() -> Result<()> { + assert_ne_or_internal_err!(1, 2); + ok_result() + } + + #[test] + fn test_assert_ne_or_internal_err_fails() { + fn check() -> Result<()> { + assert_ne_or_internal_err!(3, 3, "values must differ"); + ok_result() + } + + let err = check().unwrap_err(); + assert_snapshot!( + err.to_string(), + @r" + Internal error: Assertion failed: 3 != 3 (left: 3, right: 3): values must differ. + This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues + " + ); + } + + #[test] + fn test_assert_or_internal_err_passes() -> Result<()> { + assert_or_internal_err!(true); + assert_or_internal_err!(true, "message"); + ok_result() + } + + #[test] + fn test_assert_or_internal_err_fails_default() { + fn check() -> Result<()> { + assert_or_internal_err!(false); + ok_result() + } + + let err = check().unwrap_err(); + assert_snapshot!( + err.to_string(), + @r" + Internal error: Assertion failed: false. + This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues + " + ); + } + + #[test] + fn test_assert_or_internal_err_fails_with_message() { + fn check() -> Result<()> { + assert_or_internal_err!(false, "custom message"); + ok_result() + } + + let err = check().unwrap_err(); + assert_snapshot!( + err.to_string(), + @r" + Internal error: Assertion failed: false: custom message. + This issue was likely caused by a bug in DataFusion's code. 
Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues + " + ); + } + + #[test] + fn test_assert_or_internal_err_with_format_arguments() { + fn check() -> Result<()> { + assert_or_internal_err!(false, "custom {}", 42); + ok_result() + } + + let err = check().unwrap_err(); + assert_snapshot!( + err.to_string(), + @r" + Internal error: Assertion failed: false: custom 42. + This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues + " + ); + } #[test] fn test_error_size() { @@ -986,9 +1259,10 @@ mod test { #[test] fn datafusion_error_to_arrow() { let res = return_arrow_error().unwrap_err(); - assert!(res - .to_string() - .starts_with("External error: Error during planning: foo")); + assert!( + res.to_string() + .starts_with("External error: Error during planning: foo") + ); } #[test] @@ -1000,7 +1274,6 @@ mod test { // To pass the test the environment variable RUST_BACKTRACE should be set to 1 to enforce backtrace #[cfg(feature = "backtrace")] #[test] - #[allow(clippy::unnecessary_literal_unwrap)] fn test_enabled_backtrace() { match std::env::var("RUST_BACKTRACE") { Ok(val) if val == "1" => {} @@ -1017,17 +1290,17 @@ mod test { .unwrap(), &"Error during planning: Err" ); - assert!(!err - .split(DataFusionError::BACK_TRACE_SEP) - .collect::>() - .get(1) - .unwrap() - .is_empty()); + assert!( + !err.split(DataFusionError::BACK_TRACE_SEP) + .collect::>() + .get(1) + .unwrap() + .is_empty() + ); } #[cfg(not(feature = "backtrace"))] #[test] - #[allow(clippy::unnecessary_literal_unwrap)] fn test_disabled_backtrace() { let res: Result<(), DataFusionError> = plan_err!("Err"); let res = res.unwrap_err().to_string(); @@ -1097,7 +1370,6 @@ mod test { } #[test] - #[allow(clippy::unnecessary_literal_unwrap)] fn test_make_error_parse_input() { let res: Result<(), DataFusionError> = plan_err!("Err"); let res = res.unwrap_err(); @@ -1166,9 +1438,11 @@ mod test { let external_error_2: DataFusionError = generic_error_2.into(); println!("{external_error_2}"); - assert!(external_error_2 - .to_string() - .starts_with("External error: io error")); + assert!( + external_error_2 + .to_string() + .starts_with("External error: io error") + ); } /// Model what happens when implementing SendableRecordBatchStream: diff --git a/datafusion/common/src/file_options/csv_writer.rs b/datafusion/common/src/file_options/csv_writer.rs index 943288af91642..fa116d17277cc 100644 --- a/datafusion/common/src/file_options/csv_writer.rs +++ b/datafusion/common/src/file_options/csv_writer.rs @@ -31,6 +31,8 @@ pub struct CsvWriterOptions { /// Compression to apply after ArrowWriter serializes RecordBatches. /// This compression is applied by DataFusion not the ArrowWriter itself. pub compression: CompressionTypeVariant, + /// Compression level for the output file. + pub compression_level: Option, } impl CsvWriterOptions { @@ -41,6 +43,20 @@ impl CsvWriterOptions { Self { writer_options, compression, + compression_level: None, + } + } + + /// Create a new `CsvWriterOptions` with the specified compression level. 
+ pub fn new_with_level( + writer_options: WriterBuilder, + compression: CompressionTypeVariant, + compression_level: u32, + ) -> Self { + Self { + writer_options, + compression, + compression_level: Some(compression_level), } } } @@ -78,9 +94,17 @@ impl TryFrom<&CsvOptions> for CsvWriterOptions { if let Some(v) = &value.double_quote { builder = builder.with_double_quote(*v) } + builder = builder.with_quote_style(value.quote_style.into()); + if let Some(v) = &value.ignore_leading_whitespace { + builder = builder.with_ignore_leading_whitespace(*v) + } + if let Some(v) = &value.ignore_trailing_whitespace { + builder = builder.with_ignore_trailing_whitespace(*v) + } Ok(CsvWriterOptions { writer_options: builder, compression: value.compression, + compression_level: value.compression_level, }) } } diff --git a/datafusion/common/src/file_options/json_writer.rs b/datafusion/common/src/file_options/json_writer.rs index 750d2972329bb..a537192c8128a 100644 --- a/datafusion/common/src/file_options/json_writer.rs +++ b/datafusion/common/src/file_options/json_writer.rs @@ -27,11 +27,26 @@ use crate::{ #[derive(Clone, Debug)] pub struct JsonWriterOptions { pub compression: CompressionTypeVariant, + pub compression_level: Option, } impl JsonWriterOptions { pub fn new(compression: CompressionTypeVariant) -> Self { - Self { compression } + Self { + compression, + compression_level: None, + } + } + + /// Create a new `JsonWriterOptions` with the specified compression and level. + pub fn new_with_level( + compression: CompressionTypeVariant, + compression_level: u32, + ) -> Self { + Self { + compression, + compression_level: Some(compression_level), + } } } @@ -41,6 +56,7 @@ impl TryFrom<&JsonOptions> for JsonWriterOptions { fn try_from(value: &JsonOptions) -> Result { Ok(JsonWriterOptions { compression: value.compression, + compression_level: value.compression_level, }) } } diff --git a/datafusion/common/src/file_options/mod.rs b/datafusion/common/src/file_options/mod.rs index 02667e0165717..5d2abd23172ed 100644 --- a/datafusion/common/src/file_options/mod.rs +++ b/datafusion/common/src/file_options/mod.rs @@ -31,10 +31,10 @@ mod tests { use std::collections::HashMap; use crate::{ + Result, config::{ConfigFileType, TableOptions}, file_options::{csv_writer::CsvWriterOptions, json_writer::JsonWriterOptions}, parsers::CompressionTypeVariant, - Result, }; use parquet::{ @@ -84,7 +84,7 @@ mod tests { .build(); // Verify the expected options propagated down to parquet crate WriterProperties struct - assert_eq!(properties.max_row_group_size(), 123); + assert_eq!(properties.max_row_group_row_count(), Some(123)); assert_eq!(properties.data_page_size_limit(), 123); assert_eq!(properties.write_batch_size(), 123); assert_eq!(properties.writer_version(), WriterVersion::PARQUET_2_0); diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs index 564929c61bab0..eaf5a1642e8e2 100644 --- a/datafusion/common/src/file_options/parquet_writer.rs +++ b/datafusion/common/src/file_options/parquet_writer.rs @@ -20,22 +20,20 @@ use std::sync::Arc; use crate::{ + _internal_datafusion_err, DataFusionError, Result, config::{ParquetOptions, TableParquetOptions}, - DataFusionError, Result, _internal_datafusion_err, }; use arrow::datatypes::Schema; use parquet::arrow::encode_arrow_schema; -// TODO: handle once deprecated -#[allow(deprecated)] use parquet::{ arrow::ARROW_SCHEMA_META_KEY, basic::{BrotliLevel, GzipLevel, ZstdLevel}, file::{ metadata::KeyValue, properties::{ - 
EnabledStatistics, WriterProperties, WriterPropertiesBuilder, WriterVersion, - DEFAULT_STATISTICS_ENABLED, + DEFAULT_STATISTICS_ENABLED, EnabledStatistics, WriterProperties, + WriterPropertiesBuilder, }, }, schema::types::ColumnPath, @@ -97,7 +95,7 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder { global, column_specific_options, key_value_metadata, - crypto: _, + .. } = table_parquet_options; let mut builder = global.into_writer_properties_builder()?; @@ -106,7 +104,9 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder { if !global.skip_arrow_metadata && !key_value_metadata.contains_key(ARROW_SCHEMA_META_KEY) { - return Err(_internal_datafusion_err!("arrow schema was not added to the kv_metadata, even though it is required by configuration settings")); + return Err(_internal_datafusion_err!( + "arrow schema was not added to the kv_metadata, even though it is required by configuration settings" + )); } // add kv_meta, if any @@ -174,7 +174,6 @@ impl ParquetOptions { /// /// Note that this method does not include the key_value_metadata from [`TableParquetOptions`]. pub fn into_writer_properties_builder(&self) -> Result { - #[allow(deprecated)] let ParquetOptions { data_pagesize_limit, write_batch_size, @@ -192,6 +191,7 @@ impl ParquetOptions { bloom_filter_on_write, bloom_filter_fpp, bloom_filter_ndv, + use_content_defined_chunking, // not in WriterProperties enable_page_index: _, @@ -200,6 +200,7 @@ impl ParquetOptions { metadata_size_hint: _, pushdown_filters: _, reorder_filters: _, + force_filter_selections: _, // not used for writer props allow_single_file_parallelism: _, maximum_parallel_row_group_writers: _, maximum_buffered_record_batches_per_stream: _, @@ -214,7 +215,7 @@ impl ParquetOptions { let mut builder = WriterProperties::builder() .set_data_page_size_limit(*data_pagesize_limit) .set_write_batch_size(*write_batch_size) - .set_writer_version(parse_version_string(writer_version.as_str())?) 
+ .set_writer_version((*writer_version).into()) .set_dictionary_page_size_limit(*dictionary_page_size_limit) .set_statistics_enabled( statistics_enabled @@ -222,7 +223,7 @@ impl ParquetOptions { .and_then(|s| parse_statistics_string(s).ok()) .unwrap_or(DEFAULT_STATISTICS_ENABLED), ) - .set_max_row_group_size(*max_row_group_size) + .set_max_row_group_row_count(Some(*max_row_group_size)) .set_created_by(created_by.clone()) .set_column_index_truncate_length(*column_index_truncate_length) .set_statistics_truncate_length(*statistics_truncate_length) @@ -247,6 +248,26 @@ impl ParquetOptions { if let Some(encoding) = encoding { builder = builder.set_encoding(parse_encoding_string(encoding)?); } + if let Some(cdc) = use_content_defined_chunking { + if cdc.min_chunk_size == 0 { + return Err(DataFusionError::Configuration( + "CDC min_chunk_size must be greater than 0".to_string(), + )); + } + if cdc.max_chunk_size <= cdc.min_chunk_size { + return Err(DataFusionError::Configuration(format!( + "CDC max_chunk_size ({}) must be greater than min_chunk_size ({})", + cdc.max_chunk_size, cdc.min_chunk_size + ))); + } + builder = builder.set_content_defined_chunking(Some( + parquet::file::properties::CdcOptions { + min_chunk_size: cdc.min_chunk_size, + max_chunk_size: cdc.max_chunk_size, + norm_level: cdc.norm_level, + }, + )); + } Ok(builder) } @@ -261,7 +282,7 @@ pub(crate) fn parse_encoding_string( "plain" => Ok(parquet::basic::Encoding::PLAIN), "plain_dictionary" => Ok(parquet::basic::Encoding::PLAIN_DICTIONARY), "rle" => Ok(parquet::basic::Encoding::RLE), - #[allow(deprecated)] + #[expect(deprecated)] "bit_packed" => Ok(parquet::basic::Encoding::BIT_PACKED), "delta_binary_packed" => Ok(parquet::basic::Encoding::DELTA_BINARY_PACKED), "delta_length_byte_array" => { @@ -341,10 +362,6 @@ pub fn parse_compression_string( level, )?)) } - "lzo" => { - check_level_is_none(codec, &level)?; - Ok(parquet::basic::Compression::LZO) - } "brotli" => { let level = require_level(codec, level)?; Ok(parquet::basic::Compression::BROTLI(BrotliLevel::try_new( @@ -368,19 +385,7 @@ pub fn parse_compression_string( _ => Err(DataFusionError::Configuration(format!( "Unknown or unsupported parquet compression: \ {str_setting}. Valid values are: uncompressed, snappy, gzip(level), \ - lzo, brotli(level), lz4, zstd(level), and lz4_raw." - ))), - } -} - -pub(crate) fn parse_version_string(str_setting: &str) -> Result { - let str_setting_lower: &str = &str_setting.to_lowercase(); - match str_setting_lower { - "1.0" => Ok(WriterVersion::PARQUET_1_0), - "2.0" => Ok(WriterVersion::PARQUET_2_0), - _ => Err(DataFusionError::Configuration(format!( - "Unknown or unsupported parquet writer version {str_setting} \ - valid options are 1.0 and 2.0" + brotli(level), lz4, zstd(level), and lz4_raw." 
))), } } @@ -402,14 +407,16 @@ pub(crate) fn parse_statistics_string(str_setting: &str) -> Result ParquetOptions { let defaults = ParquetOptions::default(); - let writer_version = if defaults.writer_version.eq("1.0") { - "2.0" + let writer_version = if defaults.writer_version.eq(&DFParquetWriterVersion::V1_0) + { + DFParquetWriterVersion::V2_0 } else { - "1.0" + DFParquetWriterVersion::V1_0 }; - #[allow(deprecated)] // max_statistics_size ParquetOptions { data_pagesize_limit: 42, write_batch_size: 42, - writer_version: writer_version.into(), + writer_version, compression: Some("zstd(22)".into()), dictionary_enabled: Some(!defaults.dictionary_enabled.unwrap_or(false)), dictionary_page_size_limit: 42, @@ -464,6 +471,7 @@ mod tests { metadata_size_hint: defaults.metadata_size_hint, pushdown_filters: defaults.pushdown_filters, reorder_filters: defaults.reorder_filters, + force_filter_selections: defaults.force_filter_selections, allow_single_file_parallelism: defaults.allow_single_file_parallelism, maximum_parallel_row_group_writers: defaults .maximum_parallel_row_group_writers, @@ -475,6 +483,7 @@ mod tests { skip_arrow_metadata: defaults.skip_arrow_metadata, coerce_int96: None, max_predicate_cache_size: defaults.max_predicate_cache_size, + use_content_defined_chunking: defaults.use_content_defined_chunking.clone(), } } @@ -484,7 +493,6 @@ mod tests { ) -> ParquetColumnOptions { let bloom_filter_default_props = props.bloom_filter_properties(&col); - #[allow(deprecated)] // max_statistics_size ParquetColumnOptions { bloom_filter_enabled: Some(bloom_filter_default_props.is_some()), encoding: props.encoding(&col).map(|s| s.to_string()), @@ -545,15 +553,16 @@ mod tests { #[cfg(not(feature = "parquet_encryption"))] let fep = None; - #[allow(deprecated)] // max_statistics_size TableParquetOptions { global: ParquetOptions { // global options data_pagesize_limit: props.dictionary_page_size_limit(), write_batch_size: props.write_batch_size(), - writer_version: format!("{}.0", props.writer_version().as_num()), + writer_version: props.writer_version().into(), dictionary_page_size_limit: props.dictionary_page_size_limit(), - max_row_group_size: props.max_row_group_size(), + max_row_group_size: props + .max_row_group_row_count() + .unwrap_or(DEFAULT_MAX_ROW_GROUP_ROW_COUNT), created_by: props.created_by().to_string(), column_index_truncate_length: props.column_index_truncate_length(), statistics_truncate_length: props.statistics_truncate_length(), @@ -577,6 +586,7 @@ mod tests { metadata_size_hint: global_options_defaults.metadata_size_hint, pushdown_filters: global_options_defaults.pushdown_filters, reorder_filters: global_options_defaults.reorder_filters, + force_filter_selections: global_options_defaults.force_filter_selections, allow_single_file_parallelism: global_options_defaults .allow_single_file_parallelism, maximum_parallel_row_group_writers: global_options_defaults @@ -590,6 +600,13 @@ mod tests { binary_as_string: global_options_defaults.binary_as_string, skip_arrow_metadata: global_options_defaults.skip_arrow_metadata, coerce_int96: None, + use_content_defined_chunking: props.content_defined_chunking().map(|c| { + CdcOptions { + min_chunk_size: c.min_chunk_size, + max_chunk_size: c.max_chunk_size, + norm_level: c.norm_level, + } + }), }, column_specific_options, key_value_metadata, @@ -674,8 +691,7 @@ mod tests { let mut default_table_writer_opts = TableParquetOptions::default(); let default_parquet_opts = ParquetOptions::default(); assert_eq!( - default_table_writer_opts.global, - 
default_parquet_opts, + default_table_writer_opts.global, default_parquet_opts, "should have matching defaults for TableParquetOptions.global and ParquetOptions", ); @@ -699,7 +715,9 @@ mod tests { "should have different created_by sources", ); assert!( - default_writer_props.created_by().starts_with("parquet-rs version"), + default_writer_props + .created_by() + .starts_with("parquet-rs version"), "should indicate that writer_props defaults came from the extern parquet crate", ); assert!( @@ -733,8 +751,7 @@ mod tests { from_extern_parquet.global.skip_arrow_metadata = true; assert_eq!( - default_table_writer_opts, - from_extern_parquet, + default_table_writer_opts, from_extern_parquet, "the default writer_props should have the same configuration as the session's default TableParquetOptions", ); } @@ -800,6 +817,74 @@ mod tests { ); } + #[test] + fn test_cdc_enabled_with_custom_options() { + let mut opts = TableParquetOptions::default(); + opts.global.use_content_defined_chunking = Some(CdcOptions { + min_chunk_size: 128 * 1024, + max_chunk_size: 512 * 1024, + norm_level: 2, + }); + opts.arrow_schema(&Arc::new(Schema::empty())); + + let props = WriterPropertiesBuilder::try_from(&opts).unwrap().build(); + let cdc = props.content_defined_chunking().expect("CDC should be set"); + assert_eq!(cdc.min_chunk_size, 128 * 1024); + assert_eq!(cdc.max_chunk_size, 512 * 1024); + assert_eq!(cdc.norm_level, 2); + } + + #[test] + fn test_cdc_disabled_by_default() { + let mut opts = TableParquetOptions::default(); + opts.arrow_schema(&Arc::new(Schema::empty())); + + let props = WriterPropertiesBuilder::try_from(&opts).unwrap().build(); + assert!(props.content_defined_chunking().is_none()); + } + + #[test] + fn test_cdc_round_trip_through_writer_props() { + let mut opts = TableParquetOptions::default(); + opts.global.use_content_defined_chunking = Some(CdcOptions { + min_chunk_size: 64 * 1024, + max_chunk_size: 2 * 1024 * 1024, + norm_level: -1, + }); + opts.arrow_schema(&Arc::new(Schema::empty())); + + let props = WriterPropertiesBuilder::try_from(&opts).unwrap().build(); + let recovered = session_config_from_writer_props(&props); + + let cdc = recovered.global.use_content_defined_chunking.unwrap(); + assert_eq!(cdc.min_chunk_size, 64 * 1024); + assert_eq!(cdc.max_chunk_size, 2 * 1024 * 1024); + assert_eq!(cdc.norm_level, -1); + } + + #[test] + fn test_cdc_validation_zero_min_chunk_size() { + let mut opts = TableParquetOptions::default(); + opts.global.use_content_defined_chunking = Some(CdcOptions { + min_chunk_size: 0, + ..CdcOptions::default() + }); + opts.arrow_schema(&Arc::new(Schema::empty())); + assert!(WriterPropertiesBuilder::try_from(&opts).is_err()); + } + + #[test] + fn test_cdc_validation_max_not_greater_than_min() { + let mut opts = TableParquetOptions::default(); + opts.global.use_content_defined_chunking = Some(CdcOptions { + min_chunk_size: 512 * 1024, + max_chunk_size: 256 * 1024, + ..CdcOptions::default() + }); + opts.arrow_schema(&Arc::new(Schema::empty())); + assert!(WriterPropertiesBuilder::try_from(&opts).is_err()); + } + #[test] fn test_bloom_filter_set_ndv_only() { // the TableParquetOptions::default, with only ndv set diff --git a/datafusion/common/src/format.rs b/datafusion/common/src/format.rs index 764190e1189bf..a6bd42be691a9 100644 --- a/datafusion/common/src/format.rs +++ b/datafusion/common/src/format.rs @@ -176,9 +176,9 @@ impl FromStr for ExplainFormat { "tree" => Ok(ExplainFormat::Tree), "pgjson" => Ok(ExplainFormat::PostgresJSON), "graphviz" => 
Ok(ExplainFormat::Graphviz),
-            _ => {
-                Err(DataFusionError::Configuration(format!("Invalid explain format. Expected 'indent', 'tree', 'pgjson' or 'graphviz'. Got '{format}'")))
-            }
+            _ => Err(DataFusionError::Configuration(format!(
+                "Invalid explain format. Expected 'indent', 'tree', 'pgjson' or 'graphviz'. Got '{format}'"
+            ))),
         }
     }
 }
@@ -206,23 +206,50 @@ impl ConfigField for ExplainFormat {
     }
 }
 
-/// Verbosity levels controlling how `EXPLAIN ANALYZE` renders metrics
+/// Categorizes metrics so the display layer can choose the desired verbosity.
+///
+/// The `datafusion.explain.analyze_level` configuration controls which
+/// metric types are shown:
+/// - `"dev"` (the default): all metrics are shown.
+/// - `"summary"`: only metrics tagged as `Summary` are shown.
+///
+/// This is orthogonal to [`MetricCategory`], which filters by *what kind*
+/// of value a metric represents (rows / bytes / timing).
+///
+/// # Difference from `EXPLAIN ANALYZE VERBOSE`
+///
+/// The `VERBOSE` keyword controls whether per-partition metrics are shown
+/// (when specified) or aggregated metrics are displayed (when omitted).
+/// In contrast, `MetricType` determines which *levels* of metrics are
+/// displayed.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
-pub enum ExplainAnalyzeLevel {
-    /// Show a compact view containing high-level metrics
+pub enum MetricType {
+    /// Common metrics for high-level insights (answering which operator is slow)
     Summary,
-    /// Show a developer-focused view with per-operator details
+    /// For deep operator-level introspection for developers
     Dev,
-    // When adding new enum, update the error message in `from_str()` accordingly.
 }
 
-impl FromStr for ExplainAnalyzeLevel {
+impl MetricType {
+    /// Returns the set of metric types that should be shown for this level.
+    ///
+    /// `Dev` is a superset of `Summary`: when the user selects
+    /// `analyze_level = 'dev'`, both `Summary` and `Dev` metrics are shown.
+    pub fn included_types(self) -> Vec<MetricType> {
+        match self {
+            MetricType::Summary => vec![MetricType::Summary],
+            MetricType::Dev => vec![MetricType::Summary, MetricType::Dev],
+        }
+    }
+}
+
+impl FromStr for MetricType {
     type Err = DataFusionError;
 
-    fn from_str(level: &str) -> Result<Self, Self::Err> {
-        match level.to_lowercase().as_str() {
-            "summary" => Ok(ExplainAnalyzeLevel::Summary),
-            "dev" => Ok(ExplainAnalyzeLevel::Dev),
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s.trim().to_lowercase().as_str() {
+            "summary" => Ok(Self::Summary),
+            "dev" => Ok(Self::Dev),
             other => Err(DataFusionError::Configuration(format!(
                 "Invalid explain analyze level. Expected 'summary' or 'dev'. Got '{other}'"
             ))),
@@ -230,23 +257,176 @@ impl FromStr for ExplainAnalyzeLevel {
     }
 }
 
-impl Display for ExplainAnalyzeLevel {
+impl Display for MetricType {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let s = match self {
-            ExplainAnalyzeLevel::Summary => "summary",
-            ExplainAnalyzeLevel::Dev => "dev",
-        };
-        write!(f, "{s}")
+        match self {
+            Self::Summary => write!(f, "summary"),
+            Self::Dev => write!(f, "dev"),
+        }
+    }
+}
+
+impl ConfigField for MetricType {
+    fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
+        v.some(key, self, description)
+    }
+
+    fn set(&mut self, _: &str, value: &str) -> Result<()> {
+        *self = MetricType::from_str(value)?;
+        Ok(())
+    }
+}
+
+/// Classifies a metric by what it measures.
+///
+/// This is orthogonal to [`MetricType`] (Summary / Dev), which controls
+/// *verbosity*. 
`MetricCategory` controls *what kind of value* is shown,
+/// so that `EXPLAIN ANALYZE` output can be narrowed to only the categories
+/// that are useful in a given context.
+///
+/// In particular, this is useful for testing, since metrics differ in their stability across runs:
+/// - [`Rows`](Self::Rows) and [`Bytes`](Self::Bytes) depend only on the plan
+/// and the data, so they are mostly deterministic across runs (given the same
+/// input). Variations can still exist, e.g. because of non-deterministic ordering
+/// of evaluation between threads.
+/// Running with a single target partition often makes these metrics stable enough to assert on in tests.
+/// - [`Timing`](Self::Timing) depends on hardware, system load, scheduling,
+/// etc., so it varies from run to run even on the same machine.
+///
+/// [`MetricCategory`] is especially useful in sqllogictest (`.slt`) files:
+/// setting `datafusion.explain.analyze_categories = 'rows'` lets a test
+/// assert on row-count metrics without sprinkling `` over every
+/// timing value.
+///
+/// Metrics that do not declare a category (the default for custom
+/// `Count` / `Gauge` metrics) are treated as
+/// [`Uncategorized`](Self::Uncategorized) for filtering purposes.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum MetricCategory {
+    /// Row counts and related dimensionless counters: `output_rows`,
+    /// `spilled_rows`, `output_batches`, pruning metrics, ratios, etc.
+    ///
+    /// Mostly deterministic given the same plan and data.
+    Rows,
+    /// Byte measurements: `output_bytes`, `spilled_bytes`,
+    /// `current_memory_usage`, `bytes_scanned`, etc.
+    ///
+    /// Mostly deterministic given the same plan and data.
+    Bytes,
+    /// Wall-clock durations and timestamps: `elapsed_compute`,
+    /// operator-defined `Time` metrics, `start_timestamp` /
+    /// `end_timestamp`, etc.
+    ///
+    /// **Non-deterministic** — varies across runs even on the same hardware.
+    Timing,
+    /// Catch-all for metrics that do not fit into [`Rows`](Self::Rows),
+    /// [`Bytes`](Self::Bytes), or [`Timing`](Self::Timing).
+    ///
+    /// Custom `Count` / `Gauge` metrics that are not explicitly assigned
+    /// a category are treated as `Uncategorized` for filtering purposes.
+    ///
+    /// This variant lets users explicitly include or exclude these
+    /// metrics, e.g.:
+    /// ```sql
+    /// SET datafusion.explain.analyze_categories = 'rows, bytes, uncategorized';
+    /// ```
+    Uncategorized,
+}
+
+impl FromStr for MetricCategory {
+    type Err = DataFusionError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s.trim().to_lowercase().as_str() {
+            "rows" => Ok(Self::Rows),
+            "bytes" => Ok(Self::Bytes),
+            "timing" => Ok(Self::Timing),
+            "uncategorized" => Ok(Self::Uncategorized),
+            other => Err(DataFusionError::Configuration(format!(
+                "Invalid metric category '{other}'. \
+                 Expected 'rows', 'bytes', 'timing', or 'uncategorized'."
+            ))),
+        }
+    }
+}
+
+impl Display for MetricCategory {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Self::Rows => write!(f, "rows"),
+            Self::Bytes => write!(f, "bytes"),
+            Self::Timing => write!(f, "timing"),
+            Self::Uncategorized => write!(f, "uncategorized"),
+        }
+    }
+}
+
+/// Controls which [`MetricCategory`] values are shown in `EXPLAIN ANALYZE`.
+///
+/// Set via `SET datafusion.explain.analyze_categories = '...'`.
+///
+/// See [`MetricCategory`] for the determinism properties that motivate
+/// this filter.
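+///
+/// For example (a sketch mirroring the accepted `FromStr` inputs below):
+///
+/// ```sql
+/// -- keep only the mostly-deterministic metric categories
+/// SET datafusion.explain.analyze_categories = 'rows,bytes';
+/// -- suppress all metrics and show just the plan
+/// SET datafusion.explain.analyze_categories = 'none';
+/// ```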
+#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
+pub enum ExplainAnalyzeCategories {
+    /// Show all metrics regardless of category (the default).
+    #[default]
+    All,
+    /// Show only metrics whose category is in the list.
+    /// Metrics with no declared category are treated as
+    /// [`Uncategorized`](MetricCategory::Uncategorized) for filtering.
+    ///
+    /// An **empty** vec means "plan only" — suppress all metrics.
+    Only(Vec<MetricCategory>),
+}
+
+impl FromStr for ExplainAnalyzeCategories {
+    type Err = DataFusionError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let s = s.trim().to_lowercase();
+        match s.as_str() {
+            "all" => Ok(Self::All),
+            "none" => Ok(Self::Only(vec![])),
+            other => {
+                let mut cats = Vec::new();
+                for part in other.split(',') {
+                    cats.push(part.trim().parse::<MetricCategory>()?);
+                }
+                cats.dedup();
+                Ok(Self::Only(cats))
+            }
+        }
+    }
+}
+
+impl Display for ExplainAnalyzeCategories {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Self::All => write!(f, "all"),
+            Self::Only(cats) if cats.is_empty() => write!(f, "none"),
+            Self::Only(cats) => {
+                let mut first = true;
+                for cat in cats {
+                    if !first {
+                        write!(f, ",")?;
+                    }
+                    first = false;
+                    write!(f, "{cat}")?;
+                }
+                Ok(())
+            }
+        }
     }
 }
 
-impl ConfigField for ExplainAnalyzeLevel {
+impl ConfigField for ExplainAnalyzeCategories {
     fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
         v.some(key, self, description)
     }
 
     fn set(&mut self, _: &str, value: &str) -> Result<()> {
-        *self = ExplainAnalyzeLevel::from_str(value)?;
+        *self = ExplainAnalyzeCategories::from_str(value)?;
         Ok(())
     }
 }
diff --git a/datafusion/common/src/functional_dependencies.rs b/datafusion/common/src/functional_dependencies.rs
index 63962998ad18b..24ca33c0c2c90 100644
--- a/datafusion/common/src/functional_dependencies.rs
+++ b/datafusion/common/src/functional_dependencies.rs
@@ -590,6 +590,53 @@ pub fn get_required_group_by_exprs_indices(
         .collect()
 }
 
+/// Returns indices for the minimal subset of ORDER BY expressions that are
+/// functionally equivalent to the original set of ORDER BY expressions.
+pub fn get_required_sort_exprs_indices(
+    schema: &DFSchema,
+    sort_expr_names: &[String],
+) -> Vec<usize> {
+    let dependencies = schema.functional_dependencies();
+    let field_names = schema.field_names();
+
+    let mut known_field_indices = HashSet::new();
+    let mut required_sort_expr_indices = Vec::new();
+
+    for (sort_expr_idx, sort_expr_name) in sort_expr_names.iter().enumerate() {
+        // If the sort expression doesn't correspond to a known schema field
+        // (e.g. a computed expression), we can't reason about it via functional
+        // dependencies, so conservatively keep it.
+        let Some(field_idx) = field_names
+            .iter()
+            .position(|field_name| field_name == sort_expr_name)
+        else {
+            required_sort_expr_indices.push(sort_expr_idx);
+            continue;
+        };
+
+        // A sort expression is removable if its value is functionally determined
+        // by fields that already appear earlier in the sort order: if the earlier
+        // fields are fixed, this one's value is fixed too, so it adds no ordering
+        // information.
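+        // (Hypothetical illustration: for `ORDER BY dept_id, dept_name` where
+        // `dept_id` functionally determines `dept_name`, the `dept_name` key is
+        // removable once `dept_id` has already been kept.)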
diff --git a/datafusion/common/src/functional_dependencies.rs b/datafusion/common/src/functional_dependencies.rs
index 63962998ad18b..24ca33c0c2c90 100644
--- a/datafusion/common/src/functional_dependencies.rs
+++ b/datafusion/common/src/functional_dependencies.rs
@@ -590,6 +590,53 @@ pub fn get_required_group_by_exprs_indices(
         .collect()
 }
 
+/// Returns indices for the minimal subset of ORDER BY expressions that are
+/// functionally equivalent to the original set of ORDER BY expressions.
+pub fn get_required_sort_exprs_indices(
+    schema: &DFSchema,
+    sort_expr_names: &[String],
+) -> Vec<usize> {
+    let dependencies = schema.functional_dependencies();
+    let field_names = schema.field_names();
+
+    let mut known_field_indices = HashSet::new();
+    let mut required_sort_expr_indices = Vec::new();
+
+    for (sort_expr_idx, sort_expr_name) in sort_expr_names.iter().enumerate() {
+        // If the sort expression doesn't correspond to a known schema field
+        // (e.g. a computed expression), we can't reason about it via functional
+        // dependencies, so conservatively keep it.
+        let Some(field_idx) = field_names
+            .iter()
+            .position(|field_name| field_name == sort_expr_name)
+        else {
+            required_sort_expr_indices.push(sort_expr_idx);
+            continue;
+        };
+
+        // A sort expression is removable if its value is functionally determined
+        // by fields that already appear earlier in the sort order: if the earlier
+        // fields are fixed, this one's value is fixed too, so it adds no ordering
+        // information.
+        let removable = dependencies.deps.iter().any(|dependency| {
+            dependency.target_indices.contains(&field_idx)
+                && dependency
+                    .source_indices
+                    .iter()
+                    .all(|source_idx| known_field_indices.contains(source_idx))
+        });
+
+        if removable {
+            continue;
+        }
+
+        known_field_indices.insert(field_idx);
+        required_sort_expr_indices.push(sort_expr_idx);
+    }
+
+    required_sort_expr_indices
+}
+
 /// Updates entries inside the `entries` vector with their corresponding
 /// indices inside the `proj_indices` vector.
 fn update_elements_with_matching_indices(
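The pruning rule is easiest to see on a toy example. The sketch below re-implements the loop above over plain index lists (the `Dep` struct is a hypothetical stand-in for DataFusion's functional-dependence type): with `id` determining `name`, `ORDER BY id, name` collapses to `ORDER BY id`, while `ORDER BY name, id` keeps both keys.

```rust
use std::collections::HashSet;

// Hypothetical stand-in: one functional dependency (sources determine targets).
struct Dep {
    source_indices: Vec<usize>,
    target_indices: Vec<usize>,
}

/// Positions (within the ORDER BY list) of the sort keys that must be kept.
fn prune_sort_keys(sort_field_indices: &[usize], deps: &[Dep]) -> Vec<usize> {
    let mut known = HashSet::new();
    let mut kept = Vec::new();
    for (pos, &field) in sort_field_indices.iter().enumerate() {
        // Removable when some dependency determines `field` from fields that
        // already appeared earlier in the sort order.
        let removable = deps.iter().any(|d| {
            d.target_indices.contains(&field)
                && d.source_indices.iter().all(|s| known.contains(s))
        });
        if !removable {
            known.insert(field);
            kept.push(pos);
        }
    }
    kept
}

fn main() {
    // Schema [id, name]; `id` (index 0) determines `name` (index 1),
    // e.g. because `id` is a primary key.
    let deps = [Dep { source_indices: vec![0], target_indices: vec![1] }];
    // ORDER BY id, name  =>  `name` adds no ordering information.
    assert_eq!(prune_sort_keys(&[0, 1], &deps), vec![0]);
    // ORDER BY name, id  =>  both keys are needed.
    assert_eq!(prune_sort_keys(&[1, 0], &deps), vec![0, 1]);
}
```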
diff --git a/datafusion/common/src/hash_utils.rs b/datafusion/common/src/hash_utils.rs
index 4b18351f708b7..fcc2e919b6cc2 100644
--- a/datafusion/common/src/hash_utils.rs
+++ b/datafusion/common/src/hash_utils.rs
@@ -17,25 +17,33 @@
 //! Functionality used both on logical and physical plans
 
-#[cfg(not(feature = "force_hash_collisions"))]
-use std::sync::Arc;
-
-use ahash::RandomState;
 use arrow::array::types::{IntervalDayTime, IntervalMonthDayNano};
 use arrow::array::*;
+#[cfg(not(feature = "force_hash_collisions"))]
+use arrow::compute::take;
 use arrow::datatypes::*;
 #[cfg(not(feature = "force_hash_collisions"))]
 use arrow::{downcast_dictionary_array, downcast_primitive_array};
+use foldhash::fast::FixedState;
+#[cfg(not(feature = "force_hash_collisions"))]
+use itertools::Itertools;
+#[cfg(not(feature = "force_hash_collisions"))]
+use std::collections::HashMap;
+use std::hash::{BuildHasher, Hash, Hasher};
+
+/// The hash random state used throughout DataFusion for hashing.
+pub type RandomState = FixedState;
 
 #[cfg(not(feature = "force_hash_collisions"))]
 use crate::cast::{
     as_binary_view_array, as_boolean_array, as_fixed_size_list_array,
-    as_generic_binary_array, as_large_list_array, as_list_array, as_map_array,
-    as_string_array, as_string_view_array, as_struct_array,
+    as_generic_binary_array, as_large_list_array, as_large_list_view_array,
+    as_list_array, as_list_view_array, as_map_array, as_string_array,
+    as_string_view_array, as_struct_array, as_union_array,
 };
 use crate::error::Result;
-#[cfg(not(feature = "force_hash_collisions"))]
-use crate::error::_internal_err;
+use crate::error::{_internal_datafusion_err, _internal_err};
+use std::cell::RefCell;
 
 // Combines two hashes into one hash
 #[inline]
@@ -44,6 +52,94 @@ pub fn combine_hashes(l: u64, r: u64) -> u64 {
     hash.wrapping_mul(37).wrapping_add(r)
 }
 
+/// Maximum size for the thread-local hash buffer before truncation (4MB = 524,288 u64 elements).
+/// The goal of this is to avoid unbounded memory growth that would appear as a memory leak.
+/// We allow temporary allocations beyond this size, but after use the buffer is truncated
+/// to this size.
+const MAX_BUFFER_SIZE: usize = 524_288;
+
+thread_local! {
+    /// Thread-local buffer for hash computations to avoid repeated allocations.
+    /// The buffer is reused across calls and truncated if it exceeds MAX_BUFFER_SIZE.
+    /// It typically grows to 8192 u64 elements (the default batch size), which
+    /// corresponds to 64KB of memory.
+    static HASH_BUFFER: RefCell<Vec<u64>> = const { RefCell::new(Vec::new()) };
+}
+
+/// Creates hashes for the given arrays using a thread-local buffer, then calls the provided callback
+/// with an immutable reference to the computed hashes.
+///
+/// This function manages a thread-local buffer to avoid repeated allocations. The buffer is
+/// automatically truncated if it exceeds `MAX_BUFFER_SIZE` after use.
+///
+/// # Arguments
+/// * `arrays` - The arrays to hash (must contain at least one array)
+/// * `random_state` - The random state for hashing
+/// * `callback` - A function that receives an immutable reference to the hash slice and returns a result
+///
+/// # Errors
+/// Returns an error if:
+/// - No arrays are provided
+/// - The function is called reentrantly (i.e., the callback invokes `with_hashes` again on the same thread)
+/// - The function is called during or after thread destruction
+///
+/// # Example
+/// ```ignore
+/// use datafusion_common::hash_utils::{with_hashes, RandomState};
+/// use arrow::array::{Int32Array, ArrayRef};
+/// use std::sync::Arc;
+///
+/// let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
+/// let random_state = RandomState::default();
+///
+/// let result = with_hashes([&array], &random_state, |hashes| {
+///     // Use the hashes here
+///     Ok(hashes.len())
+/// })?;
+/// ```
+pub fn with_hashes<I, T, F, R>(
+    arrays: I,
+    random_state: &RandomState,
+    callback: F,
+) -> Result<R>
+where
+    I: IntoIterator<Item = T>,
+    T: AsDynArray,
+    F: FnOnce(&[u64]) -> Result<R>,
+{
+    // Peek at the first array to determine buffer size without fully collecting
+    let mut iter = arrays.into_iter().peekable();
+
+    // Get the required size from the first array
+    let required_size = match iter.peek() {
+        Some(arr) => arr.as_dyn_array().len(),
+        None => return _internal_err!("with_hashes requires at least one array"),
+    };
+
+    HASH_BUFFER.try_with(|cell| {
+        let mut buffer = cell.try_borrow_mut()
+            .map_err(|_| _internal_datafusion_err!("with_hashes cannot be called reentrantly on the same thread"))?;
+
+        // Ensure buffer has sufficient length, clearing old values
+        buffer.clear();
+        buffer.resize(required_size, 0);
+
+        // Create hashes in the buffer - this consumes the iterator
+        create_hashes(iter, random_state, &mut buffer[..required_size])?;
+
+        // Execute the callback with an immutable slice
+        let result = callback(&buffer[..required_size])?;
+
+        // Cleanup: truncate if buffer grew too large
+        if buffer.capacity() > MAX_BUFFER_SIZE {
+            buffer.truncate(MAX_BUFFER_SIZE);
+            buffer.shrink_to_fit();
+        }
+
+        Ok(result)
+    }).map_err(|_| _internal_datafusion_err!("with_hashes cannot access thread-local storage during or after thread destruction"))?
+}
+
 #[cfg(not(feature = "force_hash_collisions"))]
 fn hash_null(random_state: &RandomState, hashes_buffer: &'_ mut [u64], mul_col: bool) {
     if mul_col {
@@ -60,12 +156,17 @@ fn hash_null(random_state: &RandomState, hashes_buffer: &'_ mut [u64], mul_col:
 
 pub trait HashValue {
     fn hash_one(&self, state: &RandomState) -> u64;
+
+    /// Write this value into an existing hasher (same data as `hash_one`).
+    fn hash_write(&self, hasher: &mut impl Hasher);
 }
 
 impl<T: HashValue + ?Sized> HashValue for &T {
     fn hash_one(&self, state: &RandomState) -> u64 {
         T::hash_one(self, state)
     }
+    fn hash_write(&self, hasher: &mut impl Hasher) {
+        T::hash_write(self, hasher)
+    }
 }
 
 macro_rules! hash_value {
@@ -74,10 +175,13 @@ macro_rules! hash_value {
             fn hash_one(&self, state: &RandomState) -> u64 {
                 state.hash_one(self)
             }
+            fn hash_write(&self, hasher: &mut impl Hasher) {
+                Hash::hash(self, hasher)
+            }
         })+
     };
 }
-hash_value!(i8, i16, i32, i64, i128, i256, u8, u16, u32, u64);
+hash_value!(i8, i16, i32, i64, i128, i256, u8, u16, u32, u64, u128);
 hash_value!(bool, str, [u8], IntervalDayTime, IntervalMonthDayNano);
 
 macro_rules! hash_float_value {
@@ -86,14 +190,29 @@ macro_rules!
hash_float_value { fn hash_one(&self, state: &RandomState) -> u64 { state.hash_one(<$i>::from_ne_bytes(self.to_ne_bytes())) } + fn hash_write(&self, hasher: &mut impl Hasher) { + hasher.write(&self.to_ne_bytes()) + } })+ }; } hash_float_value!((half::f16, u16), (f32, u32), (f64, u64)); +/// Create a `SeedableRandomState` whose per-hasher seed incorporates `seed`. +/// This folds the previous hash into the hasher's initial state so only the +/// new value needs to pass through the hash function — same cost as `hash_one`. +#[cfg(not(feature = "force_hash_collisions"))] +#[inline] +fn seeded_state(seed: u64) -> foldhash::fast::SeedableRandomState { + foldhash::fast::SeedableRandomState::with_seed( + seed, + foldhash::SharedSeed::global_fixed(), + ) +} + /// Builds hash values of PrimitiveArray and writes them into `hashes_buffer` -/// If `rehash==true` this combines the previous hash value in the buffer -/// with the new hash using `combine_hashes` +/// If `rehash==true` this folds the existing hash into the hasher state +/// and hashes only the new value (avoiding a separate combine step). #[cfg(not(feature = "force_hash_collisions"))] fn hash_array_primitive( array: &PrimitiveArray, @@ -112,7 +231,9 @@ fn hash_array_primitive( if array.null_count() == 0 { if rehash { for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter()) { - *hash = combine_hashes(value.hash_one(random_state), *hash); + let mut hasher = seeded_state(*hash).build_hasher(); + value.hash_write(&mut hasher); + *hash = hasher.finish(); } } else { for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter()) { @@ -120,18 +241,16 @@ fn hash_array_primitive( } } } else if rehash { - for (i, hash) in hashes_buffer.iter_mut().enumerate() { - if !array.is_null(i) { - let value = unsafe { array.value_unchecked(i) }; - *hash = combine_hashes(value.hash_one(random_state), *hash); - } + for i in array.nulls().unwrap().valid_indices() { + let value = unsafe { array.value_unchecked(i) }; + let mut hasher = seeded_state(hashes_buffer[i]).build_hasher(); + value.hash_write(&mut hasher); + hashes_buffer[i] = hasher.finish(); } } else { - for (i, hash) in hashes_buffer.iter_mut().enumerate() { - if !array.is_null(i) { - let value = unsafe { array.value_unchecked(i) }; - *hash = value.hash_one(random_state); - } + for i in array.nulls().unwrap().valid_indices() { + let value = unsafe { array.value_unchecked(i) }; + hashes_buffer[i] = value.hash_one(random_state); } } } @@ -141,7 +260,7 @@ fn hash_array_primitive( /// with the new hash using `combine_hashes` #[cfg(not(feature = "force_hash_collisions"))] fn hash_array( - array: T, + array: &T, random_state: &RandomState, hashes_buffer: &mut [u64], rehash: bool, @@ -168,74 +287,257 @@ fn hash_array( } } } else if rehash { - for (i, hash) in hashes_buffer.iter_mut().enumerate() { - if !array.is_null(i) { - let value = unsafe { array.value_unchecked(i) }; - *hash = combine_hashes(value.hash_one(random_state), *hash); - } + for i in array.nulls().unwrap().valid_indices() { + let value = unsafe { array.value_unchecked(i) }; + hashes_buffer[i] = + combine_hashes(value.hash_one(random_state), hashes_buffer[i]); } } else { - for (i, hash) in hashes_buffer.iter_mut().enumerate() { - if !array.is_null(i) { - let value = unsafe { array.value_unchecked(i) }; - *hash = value.hash_one(random_state); - } + for i in array.nulls().unwrap().valid_indices() { + let value = unsafe { array.value_unchecked(i) }; + hashes_buffer[i] = value.hash_one(random_state); } } } -/// Helper 
function to update hash for a dictionary key if the value is valid +/// Hash a StringView or BytesView array +/// +/// Templated to optimize inner loop based on presence of nulls and external buffers. +/// +/// HAS_NULLS: do we have to check null in the inner loop +/// HAS_BUFFERS: if true, array has external buffers; if false, all strings are inlined/ less then 12 bytes +/// REHASH: if true, combining with existing hash, otherwise initializing #[cfg(not(feature = "force_hash_collisions"))] -#[inline] -fn update_hash_for_dict_key( - hash: &mut u64, - dict_hashes: &[u64], - dict_values: &dyn Array, - idx: usize, - multi_col: bool, +#[inline(never)] +fn hash_string_view_array_inner< + T: ByteViewType, + const HAS_NULLS: bool, + const HAS_BUFFERS: bool, + const REHASH: bool, +>( + array: &GenericByteViewArray, + random_state: &RandomState, + hashes_buffer: &mut [u64], ) { - if dict_values.is_valid(idx) { - if multi_col { - *hash = combine_hashes(dict_hashes[idx], *hash); + assert_eq!( + hashes_buffer.len(), + array.len(), + "hashes_buffer and array should be of equal length" + ); + + let buffers = array.data_buffers(); + let view_bytes = |view_len: u32, view: u128| { + let view = ByteView::from(view); + let offset = view.offset as usize; + // SAFETY: view is a valid view as it came from the array + unsafe { + let data = buffers.get_unchecked(view.buffer_index as usize); + data.get_unchecked(offset..offset + view_len as usize) + } + }; + + let hashes_and_views = hashes_buffer.iter_mut().zip(array.views().iter()); + for (i, (hash, &v)) in hashes_and_views.enumerate() { + if HAS_NULLS && array.is_null(i) { + continue; + } + let view_len = v as u32; + // all views are inlined, no need to access external buffers + if !HAS_BUFFERS || view_len <= 12 { + if REHASH { + let mut hasher = seeded_state(*hash).build_hasher(); + v.hash_write(&mut hasher); + *hash = hasher.finish(); + } else { + *hash = v.hash_one(random_state); + } + continue; + } + // view is not inlined, so we need to hash the bytes as well + let value = view_bytes(view_len, v); + if REHASH { + let mut hasher = seeded_state(*hash).build_hasher(); + value.hash_write(&mut hasher); + *hash = hasher.finish(); } else { - *hash = dict_hashes[idx]; + *hash = value.hash_one(random_state); } } - // no update for invalid dictionary value } -/// Hash the values in a dictionary array +/// Builds hash values for array views and writes them into `hashes_buffer` +/// If `rehash==true` this combines the previous hash value in the buffer +/// with the new hash using `combine_hashes` #[cfg(not(feature = "force_hash_collisions"))] -fn hash_dictionary( +fn hash_generic_byte_view_array( + array: &GenericByteViewArray, + random_state: &RandomState, + hashes_buffer: &mut [u64], + rehash: bool, +) { + // instantiate the correct version based on presence of nulls and external buffers + match ( + array.null_count() != 0, + !array.data_buffers().is_empty(), + rehash, + ) { + // no nulls or buffers ==> hash the inlined views directly + // don't call the inner function as Rust seems better able to inline this simpler code (2-3% faster) + (false, false, false) => { + for (hash, &view) in hashes_buffer.iter_mut().zip(array.views().iter()) { + *hash = view.hash_one(random_state); + } + } + (false, false, true) => { + for (hash, &view) in hashes_buffer.iter_mut().zip(array.views().iter()) { + let mut hasher = seeded_state(*hash).build_hasher(); + view.hash_write(&mut hasher); + *hash = hasher.finish(); + } + } + (false, true, false) => hash_string_view_array_inner::( + 
array, + random_state, + hashes_buffer, + ), + (false, true, true) => hash_string_view_array_inner::( + array, + random_state, + hashes_buffer, + ), + (true, false, false) => hash_string_view_array_inner::( + array, + random_state, + hashes_buffer, + ), + (true, false, true) => hash_string_view_array_inner::( + array, + random_state, + hashes_buffer, + ), + (true, true, false) => hash_string_view_array_inner::( + array, + random_state, + hashes_buffer, + ), + (true, true, true) => hash_string_view_array_inner::( + array, + random_state, + hashes_buffer, + ), + } +} + +/// Hash dictionary array with compile-time specialization for null handling. +/// +/// Uses const generics to eliminate runtim branching in the hot loop: +/// - `HAS_NULL_KEYS`: Whether to check for null dictionary keys +/// - `HAS_NULL_VALUES`: Whether to check for null dictionary values +/// - `MULTI_COL`: Whether to combine with existing hash (true) or initialize (false) +#[cfg(not(feature = "force_hash_collisions"))] +#[inline(never)] +fn hash_dictionary_inner< + K: ArrowDictionaryKeyType, + const HAS_NULL_KEYS: bool, + const HAS_NULL_VALUES: bool, + const MULTI_COL: bool, +>( array: &DictionaryArray, random_state: &RandomState, hashes_buffer: &mut [u64], - multi_col: bool, ) -> Result<()> { // Hash each dictionary value once, and then use that computed // hash for each key value to avoid a potentially expensive // redundant hashing for large dictionary elements (e.g. strings) - let dict_values = Arc::clone(array.values()); - let mut dict_hashes = vec![0; dict_values.len()]; - create_hashes(&[dict_values], random_state, &mut dict_hashes)?; - - // combine hash for each index in values let dict_values = array.values(); - for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) { - if let Some(key) = key { + let mut dict_hashes = vec![0; dict_values.len()]; + create_hashes([dict_values], random_state, &mut dict_hashes)?; + + if HAS_NULL_KEYS { + for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) { + if let Some(key) = key { + let idx = key.as_usize(); + if !HAS_NULL_VALUES || dict_values.is_valid(idx) { + if MULTI_COL { + *hash = combine_hashes(dict_hashes[idx], *hash); + } else { + *hash = dict_hashes[idx]; + } + } + } + } + } else { + for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().values()) { let idx = key.as_usize(); - update_hash_for_dict_key( - hash, - &dict_hashes, - dict_values.as_ref(), - idx, - multi_col, - ); - } // no update for Null key + if !HAS_NULL_VALUES || dict_values.is_valid(idx) { + if MULTI_COL { + *hash = combine_hashes(dict_hashes[idx], *hash); + } else { + *hash = dict_hashes[idx]; + } + } + } } Ok(()) } +/// Hash the values in a dictionary array +#[cfg(not(feature = "force_hash_collisions"))] +fn hash_dictionary( + array: &DictionaryArray, + random_state: &RandomState, + hashes_buffer: &mut [u64], + multi_col: bool, +) -> Result<()> { + let has_null_keys = array.keys().null_count() != 0; + let has_null_values = array.values().null_count() != 0; + + // Dispatcher based on null presence and multi-column mode + // Should reduce branching within hot loops + match (has_null_keys, has_null_values, multi_col) { + (false, false, false) => hash_dictionary_inner::( + array, + random_state, + hashes_buffer, + ), + (false, false, true) => hash_dictionary_inner::( + array, + random_state, + hashes_buffer, + ), + (false, true, false) => hash_dictionary_inner::( + array, + random_state, + hashes_buffer, + ), + (false, true, true) => hash_dictionary_inner::( + array, + 
random_state, + hashes_buffer, + ), + (true, false, false) => hash_dictionary_inner::( + array, + random_state, + hashes_buffer, + ), + (true, false, true) => hash_dictionary_inner::( + array, + random_state, + hashes_buffer, + ), + (true, true, false) => hash_dictionary_inner::( + array, + random_state, + hashes_buffer, + ), + (true, true, true) => hash_dictionary_inner::( + array, + random_state, + hashes_buffer, + ), + } +} + #[cfg(not(feature = "force_hash_collisions"))] fn hash_struct_array( array: &StructArray, @@ -245,19 +547,21 @@ fn hash_struct_array( let nulls = array.nulls(); let row_len = array.len(); - let valid_row_indices: Vec = if let Some(nulls) = nulls { - nulls.valid_indices().collect() - } else { - (0..row_len).collect() - }; - // Create hashes for each row that combines the hashes over all the column at that row. let mut values_hashes = vec![0u64; row_len]; create_hashes(array.columns(), random_state, &mut values_hashes)?; - for i in valid_row_indices { - let hash = &mut hashes_buffer[i]; - *hash = combine_hashes(*hash, values_hashes[i]); + // Separate paths to avoid allocating Vec when there are no nulls + if let Some(nulls) = nulls { + for i in nulls.valid_indices() { + let hash = &mut hashes_buffer[i]; + *hash = combine_hashes(*hash, values_hashes[i]); + } + } else { + for i in 0..row_len { + let hash = &mut hashes_buffer[i]; + *hash = combine_hashes(*hash, values_hashes[i]); + } } Ok(()) @@ -274,15 +578,29 @@ fn hash_map_array( let offsets = array.offsets(); // Create hashes for each entry in each row - let mut values_hashes = vec![0u64; array.entries().len()]; - create_hashes(array.entries().columns(), random_state, &mut values_hashes)?; + let first_offset = offsets.first().copied().unwrap_or_default() as usize; + let last_offset = offsets.last().copied().unwrap_or_default() as usize; + let entries_len = last_offset - first_offset; + + // Only hash the entries that are actually referenced + let mut values_hashes = vec![0u64; entries_len]; + let entries = array.entries(); + let sliced_columns: Vec = entries + .columns() + .iter() + .map(|col| col.slice(first_offset, entries_len)) + .collect(); + create_hashes(&sliced_columns, random_state, &mut values_hashes)?; // Combine the hashes for entries on each row with each other and previous hash for that row + // Adjust indices by first_offset since values_hashes is sliced starting from first_offset if let Some(nulls) = nulls { for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() { if nulls.is_valid(i) { let hash = &mut hashes_buffer[i]; - for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] { + for values_hash in &values_hashes + [start.as_usize() - first_offset..stop.as_usize() - first_offset] + { *hash = combine_hashes(*hash, *values_hash); } } @@ -290,7 +608,9 @@ fn hash_map_array( } else { for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() { let hash = &mut hashes_buffer[i]; - for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] { + for values_hash in &values_hashes + [start.as_usize() - first_offset..stop.as_usize() - first_offset] + { *hash = combine_hashes(*hash, *values_hash); } } @@ -308,24 +628,80 @@ fn hash_list_array( where OffsetSize: OffsetSizeTrait, { - let values = Arc::clone(array.values()); + // In case values is sliced, hash only the bytes used by the offsets of this ListArray + let first_offset = array.value_offsets().first().cloned().unwrap_or_default(); + let last_offset = 
array.value_offsets().last().cloned().unwrap_or_default(); + let value_bytes_len = (last_offset - first_offset).as_usize(); + let mut values_hashes = vec![0u64; value_bytes_len]; + create_hashes( + [array + .values() + .slice(first_offset.as_usize(), value_bytes_len)], + random_state, + &mut values_hashes, + )?; + + if array.null_count() > 0 { + for (i, (start, stop)) in array.value_offsets().iter().tuple_windows().enumerate() + { + if array.is_valid(i) { + let hash = &mut hashes_buffer[i]; + for values_hash in &values_hashes[(*start - first_offset).as_usize() + ..(*stop - first_offset).as_usize()] + { + *hash = combine_hashes(*hash, *values_hash); + } + } + } + } else { + for ((start, stop), hash) in array + .value_offsets() + .iter() + .tuple_windows() + .zip(hashes_buffer.iter_mut()) + { + for values_hash in &values_hashes + [(*start - first_offset).as_usize()..(*stop - first_offset).as_usize()] + { + *hash = combine_hashes(*hash, *values_hash); + } + } + } + Ok(()) +} + +#[cfg(not(feature = "force_hash_collisions"))] +fn hash_list_view_array( + array: &GenericListViewArray, + random_state: &RandomState, + hashes_buffer: &mut [u64], +) -> Result<()> +where + OffsetSize: OffsetSizeTrait, +{ + let values = array.values(); let offsets = array.value_offsets(); + let sizes = array.value_sizes(); let nulls = array.nulls(); let mut values_hashes = vec![0u64; values.len()]; - create_hashes(&[values], random_state, &mut values_hashes)?; + create_hashes([values], random_state, &mut values_hashes)?; if let Some(nulls) = nulls { - for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() { + for (i, (offset, size)) in offsets.iter().zip(sizes.iter()).enumerate() { if nulls.is_valid(i) { let hash = &mut hashes_buffer[i]; - for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] { + let start = offset.as_usize(); + let end = start + size.as_usize(); + for values_hash in &values_hashes[start..end] { *hash = combine_hashes(*hash, *values_hash); } } } } else { - for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() { + for (i, (offset, size)) in offsets.iter().zip(sizes.iter()).enumerate() { let hash = &mut hashes_buffer[i]; - for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] { + let start = offset.as_usize(); + let end = start + size.as_usize(); + for values_hash in &values_hashes[start..end] { *hash = combine_hashes(*hash, *values_hash); } } @@ -333,17 +709,145 @@ where Ok(()) } +#[cfg(not(feature = "force_hash_collisions"))] +fn hash_union_array( + array: &UnionArray, + random_state: &RandomState, + hashes_buffer: &mut [u64], +) -> Result<()> { + let DataType::Union(union_fields, _mode) = array.data_type() else { + unreachable!() + }; + + if array.is_dense() { + // Dense union: children only contain values of their type, so they're already compact. + // Use the default hashing approach which is efficient for dense unions. + hash_union_array_default(array, union_fields, random_state, hashes_buffer) + } else { + // Sparse union: each child has the same length as the union array. + // Optimization: only hash the elements that are actually referenced by type_ids, + // instead of hashing all K*N elements (where K = num types, N = array length). + hash_sparse_union_array(array, union_fields, random_state, hashes_buffer) + } +} + +/// Default hashing for union arrays - hashes all elements of each child array fully. 
+/// +/// This approach works for both dense and sparse union arrays: +/// - Dense unions: children are compact (each child only contains values of that type) +/// - Sparse unions: children have the same length as the union array +/// +/// For sparse unions with 3+ types, the optimized take/scatter approach in +/// `hash_sparse_union_array` is more efficient, but for 1-2 types or dense unions, +/// this simpler approach is preferred. +#[cfg(not(feature = "force_hash_collisions"))] +fn hash_union_array_default( + array: &UnionArray, + union_fields: &UnionFields, + random_state: &RandomState, + hashes_buffer: &mut [u64], +) -> Result<()> { + let mut child_hashes: HashMap> = + HashMap::with_capacity(union_fields.len()); + + // Hash each child array fully + for (type_id, _field) in union_fields.iter() { + let child = array.child(type_id); + let mut child_hash_buffer = vec![0; child.len()]; + create_hashes([child], random_state, &mut child_hash_buffer)?; + + child_hashes.insert(type_id, child_hash_buffer); + } + + // Combine hashes for each row using the appropriate child offset + // For dense unions: value_offset points to the actual position in the child + // For sparse unions: value_offset equals the row index + #[expect(clippy::needless_range_loop)] + for i in 0..array.len() { + let type_id = array.type_id(i); + let child_offset = array.value_offset(i); + + let child_hash = child_hashes.get(&type_id).expect("invalid type_id"); + hashes_buffer[i] = combine_hashes(hashes_buffer[i], child_hash[child_offset]); + } + + Ok(()) +} + +/// Hash a sparse union array. +/// Sparse unions have child arrays with the same length as the union array. +/// For 3+ types, we optimize by only hashing the N elements that are actually used +/// (via take/scatter), instead of hashing all K*N elements. +/// +/// For 1-2 types, the overhead of take/scatter outweighs the benefit, so we use +/// the default approach of hashing all children (same as dense unions). +#[cfg(not(feature = "force_hash_collisions"))] +fn hash_sparse_union_array( + array: &UnionArray, + union_fields: &UnionFields, + random_state: &RandomState, + hashes_buffer: &mut [u64], +) -> Result<()> { + use std::collections::HashMap; + + // For 1-2 types, the take/scatter overhead isn't worth it. + // Fall back to the default approach (same as dense union). 
+ if union_fields.len() <= 2 { + return hash_union_array_default( + array, + union_fields, + random_state, + hashes_buffer, + ); + } + + let type_ids = array.type_ids(); + + // Group indices by type_id + let mut indices_by_type: HashMap> = HashMap::new(); + for (i, &type_id) in type_ids.iter().enumerate() { + indices_by_type.entry(type_id).or_default().push(i as u32); + } + + // For each type, extract only the needed elements, hash them, and scatter back + for (type_id, _field) in union_fields.iter() { + if let Some(indices) = indices_by_type.get(&type_id) { + if indices.is_empty() { + continue; + } + + let child = array.child(type_id); + let indices_array = UInt32Array::from(indices.clone()); + + // Extract only the elements we need using take() + let filtered = take(child.as_ref(), &indices_array, None)?; + + // Hash the filtered array + let mut filtered_hashes = vec![0u64; filtered.len()]; + create_hashes([&filtered], random_state, &mut filtered_hashes)?; + + // Scatter hashes back to correct positions + for (hash, &idx) in filtered_hashes.iter().zip(indices.iter()) { + hashes_buffer[idx as usize] = + combine_hashes(hashes_buffer[idx as usize], *hash); + } + } + } + + Ok(()) +} + #[cfg(not(feature = "force_hash_collisions"))] fn hash_fixed_list_array( array: &FixedSizeListArray, random_state: &RandomState, hashes_buffer: &mut [u64], ) -> Result<()> { - let values = Arc::clone(array.values()); + let values = array.values(); let value_length = array.value_length() as usize; let nulls = array.nulls(); let mut values_hashes = vec![0u64; values.len()]; - create_hashes(&[values], random_state, &mut values_hashes)?; + create_hashes([values], random_state, &mut values_hashes)?; if let Some(nulls) = nulls { for i in 0..array.len() { if nulls.is_valid(i) { @@ -366,83 +870,246 @@ fn hash_fixed_list_array( Ok(()) } -/// Test version of `create_hashes` that produces the same value for -/// all hashes (to test collisions) -/// -/// See comments on `hashes_buffer` for more details +/// Inner hash function for RunArray +#[inline(never)] +#[cfg(not(feature = "force_hash_collisions"))] +fn hash_run_array_inner< + R: RunEndIndexType, + const HAS_NULL_VALUES: bool, + const REHASH: bool, +>( + array: &RunArray, + random_state: &RandomState, + hashes_buffer: &mut [u64], +) -> Result<()> { + // We find the relevant runs that cover potentially sliced arrays, so we can only hash those + // values. Then we find the runs that refer to the original runs and ensure that we apply + // hashes correctly to the sliced, whether sliced at the start, end, or both. + let array_offset = array.offset(); + let array_len = array.len(); + + if array_len == 0 { + return Ok(()); + } + + let run_ends = array.run_ends(); + let run_ends_values = run_ends.values(); + let values = array.values(); + + let start_physical_index = array.get_start_physical_index(); + // get_end_physical_index returns the inclusive last index, but we need the exclusive range end + // for the operations we use below. 
+ let end_physical_index = array.get_end_physical_index() + 1; + + let sliced_values = values.slice( + start_physical_index, + end_physical_index - start_physical_index, + ); + let mut values_hashes = vec![0u64; sliced_values.len()]; + create_hashes( + std::slice::from_ref(&sliced_values), + random_state, + &mut values_hashes, + )?; + + let mut start_in_slice = 0; + for (adjusted_physical_index, &absolute_run_end) in run_ends_values + [start_physical_index..end_physical_index] + .iter() + .enumerate() + { + let absolute_run_end = absolute_run_end.as_usize(); + let end_in_slice = (absolute_run_end - array_offset).min(array_len); + + if HAS_NULL_VALUES && sliced_values.is_null(adjusted_physical_index) { + start_in_slice = end_in_slice; + continue; + } + + let value_hash = values_hashes[adjusted_physical_index]; + let run_slice = &mut hashes_buffer[start_in_slice..end_in_slice]; + + if REHASH { + for hash in run_slice.iter_mut() { + *hash = combine_hashes(value_hash, *hash); + } + } else { + run_slice.fill(value_hash); + } + + start_in_slice = end_in_slice; + } + + Ok(()) +} + +#[cfg(not(feature = "force_hash_collisions"))] +fn hash_run_array( + array: &RunArray, + random_state: &RandomState, + hashes_buffer: &mut [u64], + rehash: bool, +) -> Result<()> { + let has_null_values = array.values().null_count() != 0; + + match (has_null_values, rehash) { + (false, false) => { + hash_run_array_inner::(array, random_state, hashes_buffer) + } + (false, true) => { + hash_run_array_inner::(array, random_state, hashes_buffer) + } + (true, false) => { + hash_run_array_inner::(array, random_state, hashes_buffer) + } + (true, true) => { + hash_run_array_inner::(array, random_state, hashes_buffer) + } + } +} + +/// Internal helper function that hashes a single array and either initializes or combines +/// the hash values in the buffer. +#[cfg(not(feature = "force_hash_collisions"))] +fn hash_single_array( + array: &dyn Array, + random_state: &RandomState, + hashes_buffer: &mut [u64], + rehash: bool, +) -> Result<()> { + downcast_primitive_array! { + array => hash_array_primitive(array, random_state, hashes_buffer, rehash), + DataType::Null => hash_null(random_state, hashes_buffer, rehash), + DataType::Boolean => hash_array(&as_boolean_array(array)?, random_state, hashes_buffer, rehash), + DataType::Utf8 => hash_array(&as_string_array(array)?, random_state, hashes_buffer, rehash), + DataType::Utf8View => hash_generic_byte_view_array(as_string_view_array(array)?, random_state, hashes_buffer, rehash), + DataType::LargeUtf8 => hash_array(&as_largestring_array(array), random_state, hashes_buffer, rehash), + DataType::Binary => hash_array(&as_generic_binary_array::(array)?, random_state, hashes_buffer, rehash), + DataType::BinaryView => hash_generic_byte_view_array(as_binary_view_array(array)?, random_state, hashes_buffer, rehash), + DataType::LargeBinary => hash_array(&as_generic_binary_array::(array)?, random_state, hashes_buffer, rehash), + DataType::FixedSizeBinary(_) => { + let array: &FixedSizeBinaryArray = array.as_any().downcast_ref().unwrap(); + hash_array(&array, random_state, hashes_buffer, rehash) + } + DataType::Dictionary(_, _) => downcast_dictionary_array! 
{ + array => hash_dictionary(array, random_state, hashes_buffer, rehash)?, + _ => unreachable!() + } + DataType::Struct(_) => { + let array = as_struct_array(array)?; + hash_struct_array(array, random_state, hashes_buffer)?; + } + DataType::List(_) => { + let array = as_list_array(array)?; + hash_list_array(array, random_state, hashes_buffer)?; + } + DataType::LargeList(_) => { + let array = as_large_list_array(array)?; + hash_list_array(array, random_state, hashes_buffer)?; + } + DataType::ListView(_) => { + let array = as_list_view_array(array)?; + hash_list_view_array(array, random_state, hashes_buffer)?; + } + DataType::LargeListView(_) => { + let array = as_large_list_view_array(array)?; + hash_list_view_array(array, random_state, hashes_buffer)?; + } + DataType::Map(_, _) => { + let array = as_map_array(array)?; + hash_map_array(array, random_state, hashes_buffer)?; + } + DataType::FixedSizeList(_,_) => { + let array = as_fixed_size_list_array(array)?; + hash_fixed_list_array(array, random_state, hashes_buffer)?; + } + DataType::Union(_, _) => { + let array = as_union_array(array)?; + hash_union_array(array, random_state, hashes_buffer)?; + } + DataType::RunEndEncoded(_, _) => downcast_run_array! { + array => hash_run_array(array, random_state, hashes_buffer, rehash)?, + _ => unreachable!() + } + _ => { + // This is internal because we should have caught this before. + return _internal_err!( + "Unsupported data type in hasher: {}", + array.data_type() + ); + } + } + Ok(()) +} + +/// Test version of `hash_single_array` that forces all hashes to collide to zero. #[cfg(feature = "force_hash_collisions")] -pub fn create_hashes<'a>( - _arrays: &[ArrayRef], +fn hash_single_array( + _array: &dyn Array, _random_state: &RandomState, - hashes_buffer: &'a mut Vec, -) -> Result<&'a mut Vec> { + hashes_buffer: &mut [u64], + _rehash: bool, +) -> Result<()> { for hash in hashes_buffer.iter_mut() { *hash = 0 } - Ok(hashes_buffer) + Ok(()) } -/// Creates hash values for every row, based on the values in the -/// columns. +/// Something that can be returned as a `&dyn Array`. +/// +/// We want `create_hashes` to accept either `&dyn Array` or `ArrayRef`, +/// and this seems the best way to do so. +/// +/// We tried having it accept `AsRef` +/// but that is not implemented for and cannot be implemented for +/// `&dyn Array` so callers that have the latter would not be able +/// to call `create_hashes` directly. This shim trait makes it possible. +pub trait AsDynArray { + fn as_dyn_array(&self) -> &dyn Array; +} + +impl AsDynArray for dyn Array { + fn as_dyn_array(&self) -> &dyn Array { + self + } +} + +impl AsDynArray for &dyn Array { + fn as_dyn_array(&self) -> &dyn Array { + *self + } +} + +impl AsDynArray for ArrayRef { + fn as_dyn_array(&self) -> &dyn Array { + self.as_ref() + } +} + +impl AsDynArray for &ArrayRef { + fn as_dyn_array(&self) -> &dyn Array { + self.as_ref() + } +} + +/// Creates hash values for every row, based on the values in the columns. /// /// The number of rows to hash is determined by `hashes_buffer.len()`. -/// `hashes_buffer` should be pre-sized appropriately -#[cfg(not(feature = "force_hash_collisions"))] -pub fn create_hashes<'a>( - arrays: &[ArrayRef], +/// `hashes_buffer` should be pre-sized appropriately. 
+pub fn create_hashes<'a, I, T>( + arrays: I, random_state: &RandomState, - hashes_buffer: &'a mut Vec, -) -> Result<&'a mut Vec> { - for (i, col) in arrays.iter().enumerate() { - let array = col.as_ref(); + hashes_buffer: &'a mut [u64], +) -> Result<&'a mut [u64]> +where + I: IntoIterator, + T: AsDynArray, +{ + for (i, array) in arrays.into_iter().enumerate() { // combine hashes with `combine_hashes` for all columns besides the first let rehash = i >= 1; - downcast_primitive_array! { - array => hash_array_primitive(array, random_state, hashes_buffer, rehash), - DataType::Null => hash_null(random_state, hashes_buffer, rehash), - DataType::Boolean => hash_array(as_boolean_array(array)?, random_state, hashes_buffer, rehash), - DataType::Utf8 => hash_array(as_string_array(array)?, random_state, hashes_buffer, rehash), - DataType::Utf8View => hash_array(as_string_view_array(array)?, random_state, hashes_buffer, rehash), - DataType::LargeUtf8 => hash_array(as_largestring_array(array), random_state, hashes_buffer, rehash), - DataType::Binary => hash_array(as_generic_binary_array::(array)?, random_state, hashes_buffer, rehash), - DataType::BinaryView => hash_array(as_binary_view_array(array)?, random_state, hashes_buffer, rehash), - DataType::LargeBinary => hash_array(as_generic_binary_array::(array)?, random_state, hashes_buffer, rehash), - DataType::FixedSizeBinary(_) => { - let array: &FixedSizeBinaryArray = array.as_any().downcast_ref().unwrap(); - hash_array(array, random_state, hashes_buffer, rehash) - } - DataType::Dictionary(_, _) => downcast_dictionary_array! { - array => hash_dictionary(array, random_state, hashes_buffer, rehash)?, - _ => unreachable!() - } - DataType::Struct(_) => { - let array = as_struct_array(array)?; - hash_struct_array(array, random_state, hashes_buffer)?; - } - DataType::List(_) => { - let array = as_list_array(array)?; - hash_list_array(array, random_state, hashes_buffer)?; - } - DataType::LargeList(_) => { - let array = as_large_list_array(array)?; - hash_list_array(array, random_state, hashes_buffer)?; - } - DataType::Map(_, _) => { - let array = as_map_array(array)?; - hash_map_array(array, random_state, hashes_buffer)?; - } - DataType::FixedSizeList(_,_) => { - let array = as_fixed_size_list_array(array)?; - hash_fixed_list_array(array, random_state, hashes_buffer)?; - } - _ => { - // This is internal because we should have caught this before. 
- return _internal_err!( - "Unsupported data type in hasher: {}", - col.data_type() - ); - } - } + hash_single_array(array.as_dyn_array(), random_state, hashes_buffer, rehash)?; } Ok(hashes_buffer) } @@ -465,8 +1132,8 @@ mod tests { .collect::() .with_precision_and_scale(20, 3) .unwrap(); - let array_ref = Arc::new(array); - let random_state = RandomState::with_seeds(0, 0, 0, 0); + let array_ref: ArrayRef = Arc::new(array); + let random_state = RandomState::with_seed(0); let hashes_buff = &mut vec![0; array_ref.len()]; let hashes = create_hashes(&[array_ref], &random_state, hashes_buff)?; assert_eq!(hashes.len(), 4); @@ -476,19 +1143,25 @@ mod tests { #[test] fn create_hashes_for_empty_fixed_size_lit() -> Result<()> { let empty_array = FixedSizeListBuilder::new(StringBuilder::new(), 1).finish(); - let random_state = RandomState::with_seeds(0, 0, 0, 0); - let hashes_buff = &mut vec![0; 0]; - let hashes = create_hashes(&[Arc::new(empty_array)], &random_state, hashes_buff)?; + let random_state = RandomState::with_seed(0); + let hashes_buff = &mut [0; 0]; + let hashes = create_hashes( + &[Arc::new(empty_array) as ArrayRef], + &random_state, + hashes_buff, + )?; assert_eq!(hashes, &Vec::::new()); Ok(()) } #[test] fn create_hashes_for_float_arrays() -> Result<()> { - let f32_arr = Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7])); - let f64_arr = Arc::new(Float64Array::from(vec![0.12, 0.5, 1f64, 444.7])); + let f32_arr: ArrayRef = + Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7])); + let f64_arr: ArrayRef = + Arc::new(Float64Array::from(vec![0.12, 0.5, 1f64, 444.7])); - let random_state = RandomState::with_seeds(0, 0, 0, 0); + let random_state = RandomState::with_seed(0); let hashes_buff = &mut vec![0; f32_arr.len()]; let hashes = create_hashes(&[f32_arr], &random_state, hashes_buff)?; assert_eq!(hashes.len(), 4,); @@ -514,18 +1187,15 @@ mod tests { Some(b"Longer than 12 bytes string"), ]; - let binary_array = Arc::new(binary.iter().cloned().collect::<$ARRAY>()); - let ref_array = Arc::new(binary.iter().cloned().collect::()); + let binary_array: ArrayRef = + Arc::new(binary.iter().cloned().collect::<$ARRAY>()); - let random_state = RandomState::with_seeds(0, 0, 0, 0); + let random_state = RandomState::with_seed(0); let mut binary_hashes = vec![0; binary.len()]; create_hashes(&[binary_array], &random_state, &mut binary_hashes) .unwrap(); - let mut ref_hashes = vec![0; binary.len()]; - create_hashes(&[ref_array], &random_state, &mut ref_hashes).unwrap(); - // Null values result in a zero hash, for (val, hash) in binary.iter().zip(binary_hashes.iter()) { match val { @@ -534,9 +1204,6 @@ mod tests { } } - // same logical values should hash to the same hash value - assert_eq!(binary_hashes, ref_hashes); - // Same values should map to same hash values assert_eq!(binary[0], binary[5]); assert_eq!(binary[4], binary[6]); @@ -548,15 +1215,16 @@ mod tests { } create_hash_binary!(binary_array, BinaryArray); + create_hash_binary!(large_binary_array, LargeBinaryArray); create_hash_binary!(binary_view_array, BinaryViewArray); #[test] fn create_hashes_fixed_size_binary() -> Result<()> { let input_arg = vec![vec![1, 2], vec![5, 6], vec![5, 6]]; - let fixed_size_binary_array = + let fixed_size_binary_array: ArrayRef = Arc::new(FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap()); - let random_state = RandomState::with_seeds(0, 0, 0, 0); + let random_state = RandomState::with_seed(0); let hashes_buff = &mut vec![0; fixed_size_binary_array.len()]; let hashes = 
create_hashes(&[fixed_size_binary_array], &random_state, hashes_buff)?; @@ -580,15 +1248,16 @@ mod tests { Some("Longer than 12 bytes string"), ]; - let string_array = Arc::new(strings.iter().cloned().collect::<$ARRAY>()); - let dict_array = Arc::new( + let string_array: ArrayRef = + Arc::new(strings.iter().cloned().collect::<$ARRAY>()); + let dict_array: ArrayRef = Arc::new( strings .iter() .cloned() .collect::>(), ); - let random_state = RandomState::with_seeds(0, 0, 0, 0); + let random_state = RandomState::with_seed(0); let mut string_hashes = vec![0; strings.len()]; create_hashes(&[string_array], &random_state, &mut string_hashes) @@ -623,21 +1292,90 @@ mod tests { create_hash_string!(string_view_array, StringArray); create_hash_string!(dict_string_array, DictionaryArray); + #[test] + #[cfg(not(feature = "force_hash_collisions"))] + fn create_hashes_for_run_array() -> Result<()> { + let values = Arc::new(Int32Array::from(vec![10, 20, 30])); + let run_ends = Arc::new(Int32Array::from(vec![2, 5, 7])); + let array = Arc::new(RunArray::try_new(&run_ends, values.as_ref()).unwrap()); + + let random_state = RandomState::with_seed(0); + let hashes_buff = &mut vec![0; array.len()]; + let hashes = create_hashes( + &[Arc::clone(&array) as ArrayRef], + &random_state, + hashes_buff, + )?; + + assert_eq!(hashes.len(), 7); + assert_eq!(hashes[0], hashes[1]); + assert_eq!(hashes[2], hashes[3]); + assert_eq!(hashes[3], hashes[4]); + assert_eq!(hashes[5], hashes[6]); + assert_ne!(hashes[0], hashes[2]); + assert_ne!(hashes[2], hashes[5]); + assert_ne!(hashes[0], hashes[5]); + + Ok(()) + } + + #[test] + #[cfg(not(feature = "force_hash_collisions"))] + fn create_multi_column_hash_with_run_array() -> Result<()> { + let int_array = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7])); + let values = Arc::new(StringArray::from(vec!["foo", "bar", "baz"])); + let run_ends = Arc::new(Int32Array::from(vec![2, 5, 7])); + let run_array = Arc::new(RunArray::try_new(&run_ends, values.as_ref()).unwrap()); + + let random_state = RandomState::with_seed(0); + let mut one_col_hashes = vec![0; int_array.len()]; + create_hashes( + &[Arc::clone(&int_array) as ArrayRef], + &random_state, + &mut one_col_hashes, + )?; + + let mut two_col_hashes = vec![0; int_array.len()]; + create_hashes( + &[ + Arc::clone(&int_array) as ArrayRef, + Arc::clone(&run_array) as ArrayRef, + ], + &random_state, + &mut two_col_hashes, + )?; + + assert_eq!(one_col_hashes.len(), 7); + assert_eq!(two_col_hashes.len(), 7); + assert_ne!(one_col_hashes, two_col_hashes); + + let diff_0_vs_1_one_col = one_col_hashes[0] != one_col_hashes[1]; + let diff_0_vs_1_two_col = two_col_hashes[0] != two_col_hashes[1]; + assert_eq!(diff_0_vs_1_one_col, diff_0_vs_1_two_col); + + let diff_2_vs_3_one_col = one_col_hashes[2] != one_col_hashes[3]; + let diff_2_vs_3_two_col = two_col_hashes[2] != two_col_hashes[3]; + assert_eq!(diff_2_vs_3_one_col, diff_2_vs_3_two_col); + + Ok(()) + } + #[test] // Tests actual values of hashes, which are different if forcing collisions #[cfg(not(feature = "force_hash_collisions"))] fn create_hashes_for_dict_arrays() { let strings = [Some("foo"), None, Some("bar"), Some("foo"), None]; - let string_array = Arc::new(strings.iter().cloned().collect::()); - let dict_array = Arc::new( + let string_array: ArrayRef = + Arc::new(strings.iter().cloned().collect::()); + let dict_array: ArrayRef = Arc::new( strings .iter() .cloned() .collect::>(), ); - let random_state = RandomState::with_seeds(0, 0, 0, 0); + let random_state = 
RandomState::with_seed(0); let mut string_hashes = vec![0; strings.len()]; create_hashes(&[string_array], &random_state, &mut string_hashes).unwrap(); @@ -682,7 +1420,7 @@ mod tests { ]; let list_array = Arc::new(ListArray::from_iter_primitive::(data)) as ArrayRef; - let random_state = RandomState::with_seeds(0, 0, 0, 0); + let random_state = RandomState::with_seed(0); let mut hashes = vec![0; list_array.len()]; create_hashes(&[list_array], &random_state, &mut hashes).unwrap(); assert_eq!(hashes[0], hashes[5]); @@ -691,6 +1429,130 @@ mod tests { assert_eq!(hashes[1], hashes[6]); // null vs empty list } + #[test] + #[cfg(not(feature = "force_hash_collisions"))] + fn create_hashes_for_sliced_list_arrays() { + let data = vec![ + Some(vec![Some(0), Some(1), Some(2)]), + None, + // Slice from here + Some(vec![Some(3), None, Some(5)]), + Some(vec![Some(3), None, Some(5)]), + None, + // To here + Some(vec![Some(0), Some(1), Some(2)]), + Some(vec![]), + ]; + let list_array = + Arc::new(ListArray::from_iter_primitive::(data)) as ArrayRef; + let list_array = list_array.slice(2, 3); + let random_state = RandomState::with_seed(0); + let mut hashes = vec![0; list_array.len()]; + create_hashes(&[list_array], &random_state, &mut hashes).unwrap(); + assert_eq!(hashes[0], hashes[1]); + assert_ne!(hashes[1], hashes[2]); + } + + #[test] + // Tests actual values of hashes, which are different if forcing collisions + #[cfg(not(feature = "force_hash_collisions"))] + fn create_hashes_for_list_view_arrays() { + use arrow::buffer::{NullBuffer, ScalarBuffer}; + + // Create values array: [0, 1, 2, 3, null, 5] + let values = Arc::new(Int32Array::from(vec![ + Some(0), + Some(1), + Some(2), + Some(3), + None, + Some(5), + ])) as ArrayRef; + let field = Arc::new(Field::new("item", DataType::Int32, true)); + + // Create ListView with the following logical structure: + // Row 0: [0, 1, 2] (offset=0, size=3) + // Row 1: null (null bit set) + // Row 2: [3, null, 5] (offset=3, size=3) + // Row 3: [3, null, 5] (offset=3, size=3) - same as row 2 + // Row 4: null (null bit set) + // Row 5: [0, 1, 2] (offset=0, size=3) - same as row 0 + // Row 6: [] (offset=0, size=0) - empty list + let offsets = ScalarBuffer::from(vec![0i32, 0, 3, 3, 0, 0, 0]); + let sizes = ScalarBuffer::from(vec![3i32, 0, 3, 3, 0, 3, 0]); + let nulls = Some(NullBuffer::from(vec![ + true, false, true, true, false, true, true, + ])); + + let list_view_array = + Arc::new(ListViewArray::new(field, offsets, sizes, values, nulls)) + as ArrayRef; + + let random_state = RandomState::with_seed(0); + let mut hashes = vec![0; list_view_array.len()]; + create_hashes(&[list_view_array], &random_state, &mut hashes).unwrap(); + + assert_eq!(hashes[0], hashes[5]); // same content [0, 1, 2] + assert_eq!(hashes[1], hashes[4]); // both null + assert_eq!(hashes[2], hashes[3]); // same content [3, null, 5] + assert_eq!(hashes[1], hashes[6]); // null vs empty list + + // Negative tests: different content should produce different hashes + assert_ne!(hashes[0], hashes[2]); // [0, 1, 2] vs [3, null, 5] + assert_ne!(hashes[0], hashes[6]); // [0, 1, 2] vs [] + assert_ne!(hashes[2], hashes[6]); // [3, null, 5] vs [] + } + + #[test] + // Tests actual values of hashes, which are different if forcing collisions + #[cfg(not(feature = "force_hash_collisions"))] + fn create_hashes_for_large_list_view_arrays() { + use arrow::buffer::{NullBuffer, ScalarBuffer}; + + // Create values array: [0, 1, 2, 3, null, 5] + let values = Arc::new(Int32Array::from(vec![ + Some(0), + Some(1), + Some(2), + 
Some(3), + None, + Some(5), + ])) as ArrayRef; + let field = Arc::new(Field::new("item", DataType::Int32, true)); + + // Create LargeListView with the following logical structure: + // Row 0: [0, 1, 2] (offset=0, size=3) + // Row 1: null (null bit set) + // Row 2: [3, null, 5] (offset=3, size=3) + // Row 3: [3, null, 5] (offset=3, size=3) - same as row 2 + // Row 4: null (null bit set) + // Row 5: [0, 1, 2] (offset=0, size=3) - same as row 0 + // Row 6: [] (offset=0, size=0) - empty list + let offsets = ScalarBuffer::from(vec![0i64, 0, 3, 3, 0, 0, 0]); + let sizes = ScalarBuffer::from(vec![3i64, 0, 3, 3, 0, 3, 0]); + let nulls = Some(NullBuffer::from(vec![ + true, false, true, true, false, true, true, + ])); + + let large_list_view_array = Arc::new(LargeListViewArray::new( + field, offsets, sizes, values, nulls, + )) as ArrayRef; + + let random_state = RandomState::with_seed(0); + let mut hashes = vec![0; large_list_view_array.len()]; + create_hashes(&[large_list_view_array], &random_state, &mut hashes).unwrap(); + + assert_eq!(hashes[0], hashes[5]); // same content [0, 1, 2] + assert_eq!(hashes[1], hashes[4]); // both null + assert_eq!(hashes[2], hashes[3]); // same content [3, null, 5] + assert_eq!(hashes[1], hashes[6]); // null vs empty list + + // Negative tests: different content should produce different hashes + assert_ne!(hashes[0], hashes[2]); // [0, 1, 2] vs [3, null, 5] + assert_ne!(hashes[0], hashes[6]); // [0, 1, 2] vs [] + assert_ne!(hashes[2], hashes[6]); // [3, null, 5] vs [] + } + #[test] // Tests actual values of hashes, which are different if forcing collisions #[cfg(not(feature = "force_hash_collisions"))] @@ -707,7 +1569,7 @@ mod tests { Arc::new(FixedSizeListArray::from_iter_primitive::( data, 3, )) as ArrayRef; - let random_state = RandomState::with_seeds(0, 0, 0, 0); + let random_state = RandomState::with_seed(0); let mut hashes = vec![0; list_array.len()]; create_hashes(&[list_array], &random_state, &mut hashes).unwrap(); assert_eq!(hashes[0], hashes[5]); @@ -757,7 +1619,7 @@ mod tests { let array = Arc::new(struct_array) as ArrayRef; - let random_state = RandomState::with_seeds(0, 0, 0, 0); + let random_state = RandomState::with_seed(0); let mut hashes = vec![0; array.len()]; create_hashes(&[array], &random_state, &mut hashes).unwrap(); assert_eq!(hashes[0], hashes[1]); @@ -794,7 +1656,7 @@ mod tests { assert!(struct_array.is_valid(1)); let array = Arc::new(struct_array) as ArrayRef; - let random_state = RandomState::with_seeds(0, 0, 0, 0); + let random_state = RandomState::with_seed(0); let mut hashes = vec![0; array.len()]; create_hashes(&[array], &random_state, &mut hashes).unwrap(); assert_eq!(hashes[0], hashes[1]); @@ -847,7 +1709,7 @@ mod tests { let array = Arc::new(builder.finish()) as ArrayRef; - let random_state = RandomState::with_seeds(0, 0, 0, 0); + let random_state = RandomState::with_seed(0); let mut hashes = vec![0; array.len()]; create_hashes(&[array], &random_state, &mut hashes).unwrap(); assert_eq!(hashes[0], hashes[1]); // same value @@ -865,15 +1727,16 @@ mod tests { let strings1 = [Some("foo"), None, Some("bar")]; let strings2 = [Some("blarg"), Some("blah"), None]; - let string_array = Arc::new(strings1.iter().cloned().collect::()); - let dict_array = Arc::new( + let string_array: ArrayRef = + Arc::new(strings1.iter().cloned().collect::()); + let dict_array: ArrayRef = Arc::new( strings2 .iter() .cloned() .collect::>(), ); - let random_state = RandomState::with_seeds(0, 0, 0, 0); + let random_state = RandomState::with_seed(0); let mut 
one_col_hashes = vec![0; strings1.len()]; create_hashes( @@ -896,4 +1759,345 @@ mod tests { assert_ne!(one_col_hashes, two_col_hashes); } + + #[test] + fn test_create_hashes_from_arrays() { + let int_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4])); + let float_array: ArrayRef = + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])); + + let random_state = RandomState::with_seed(0); + let hashes_buff = &mut vec![0; int_array.len()]; + let hashes = + create_hashes(&[int_array, float_array], &random_state, hashes_buff).unwrap(); + assert_eq!(hashes.len(), 4,); + } + + #[test] + fn test_create_hashes_from_dyn_arrays() { + let int_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4])); + let float_array: ArrayRef = + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])); + + // Verify that we can call create_hashes with only &dyn Array + fn test(arr1: &dyn Array, arr2: &dyn Array) { + let random_state = RandomState::with_seed(0); + let hashes_buff = &mut vec![0; arr1.len()]; + let hashes = create_hashes([arr1, arr2], &random_state, hashes_buff).unwrap(); + assert_eq!(hashes.len(), 4,); + } + test(&*int_array, &*float_array); + } + + #[test] + fn test_create_hashes_equivalence() { + let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4])); + let random_state = RandomState::with_seed(0); + + let mut hashes1 = vec![0; array.len()]; + create_hashes( + &[Arc::clone(&array) as ArrayRef], + &random_state, + &mut hashes1, + ) + .unwrap(); + + let mut hashes2 = vec![0; array.len()]; + create_hashes([array], &random_state, &mut hashes2).unwrap(); + + assert_eq!(hashes1, hashes2); + } + + #[test] + fn test_with_hashes() { + let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4])); + let random_state = RandomState::with_seed(0); + + // Test that with_hashes produces the same results as create_hashes + let mut expected_hashes = vec![0; array.len()]; + create_hashes([&array], &random_state, &mut expected_hashes).unwrap(); + + let result = with_hashes([&array], &random_state, |hashes| { + assert_eq!(hashes.len(), 4); + // Verify hashes match expected values + assert_eq!(hashes, &expected_hashes[..]); + // Return a copy of the hashes + Ok(hashes.to_vec()) + }) + .unwrap(); + + // Verify callback result is returned correctly + assert_eq!(result, expected_hashes); + } + + #[test] + fn test_with_hashes_multi_column() { + let int_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); + let str_array: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c"])); + let random_state = RandomState::with_seed(0); + + // Test multi-column hashing + let mut expected_hashes = vec![0; int_array.len()]; + create_hashes( + [&int_array, &str_array], + &random_state, + &mut expected_hashes, + ) + .unwrap(); + + with_hashes([&int_array, &str_array], &random_state, |hashes| { + assert_eq!(hashes.len(), 3); + assert_eq!(hashes, &expected_hashes[..]); + Ok(()) + }) + .unwrap(); + } + + #[test] + fn test_with_hashes_empty_arrays() { + let random_state = RandomState::with_seed(0); + + // Test that passing no arrays returns an error + let empty: [&ArrayRef; 0] = []; + let result = with_hashes(empty, &random_state, |_hashes| Ok(())); + + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("requires at least one array") + ); + } + + #[test] + fn test_with_hashes_reentrancy() { + let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); + let array2: ArrayRef = Arc::new(Int32Array::from(vec![4, 5, 6])); + let random_state = 
RandomState::with_seed(0); + + // Test that reentrant calls return an error instead of panicking + let result = with_hashes([&array], &random_state, |_hashes| { + // Try to call with_hashes again inside the callback + with_hashes([&array2], &random_state, |_inner_hashes| Ok(())) + }); + + assert!(result.is_err()); + let err_msg = result.unwrap_err().to_string(); + assert!( + err_msg.contains("reentrantly") || err_msg.contains("cannot be called"), + "Error message should mention reentrancy: {err_msg}", + ); + } + + #[test] + #[cfg(not(feature = "force_hash_collisions"))] + fn create_hashes_for_sparse_union_arrays() { + // logical array: [int(5), str("foo"), int(10), int(5)] + let int_array = Int32Array::from(vec![Some(5), None, Some(10), Some(5)]); + let str_array = StringArray::from(vec![None, Some("foo"), None, None]); + + let type_ids = vec![0_i8, 1, 0, 0].into(); + let children = vec![ + Arc::new(int_array) as ArrayRef, + Arc::new(str_array) as ArrayRef, + ]; + + let union_fields = [ + (0, Arc::new(Field::new("a", DataType::Int32, true))), + (1, Arc::new(Field::new("b", DataType::Utf8, true))), + ] + .into_iter() + .collect(); + + let array = UnionArray::try_new(union_fields, type_ids, None, children).unwrap(); + let array_ref = Arc::new(array) as ArrayRef; + + let random_state = RandomState::with_seed(0); + let mut hashes = vec![0; array_ref.len()]; + create_hashes(&[array_ref], &random_state, &mut hashes).unwrap(); + + // Rows 0 and 3 both have type_id=0 (int) with value 5 + assert_eq!(hashes[0], hashes[3]); + // Row 0 (int 5) vs Row 2 (int 10) - different values + assert_ne!(hashes[0], hashes[2]); + // Row 0 (int) vs Row 1 (string) - different types + assert_ne!(hashes[0], hashes[1]); + } + + #[test] + #[cfg(not(feature = "force_hash_collisions"))] + fn create_hashes_for_sparse_union_arrays_with_nulls() { + // logical array: [int(5), str("foo"), int(null), str(null)] + let int_array = Int32Array::from(vec![Some(5), None, None, None]); + let str_array = StringArray::from(vec![None, Some("foo"), None, None]); + + let type_ids = vec![0, 1, 0, 1].into(); + let children = vec![ + Arc::new(int_array) as ArrayRef, + Arc::new(str_array) as ArrayRef, + ]; + + let union_fields = [ + (0, Arc::new(Field::new("a", DataType::Int32, true))), + (1, Arc::new(Field::new("b", DataType::Utf8, true))), + ] + .into_iter() + .collect(); + + let array = UnionArray::try_new(union_fields, type_ids, None, children).unwrap(); + let array_ref = Arc::new(array) as ArrayRef; + + let random_state = RandomState::with_seed(0); + let mut hashes = vec![0; array_ref.len()]; + create_hashes(&[array_ref], &random_state, &mut hashes).unwrap(); + + // row 2 (int null) and row 3 (str null) should have the same hash + // because they are both null values + assert_eq!(hashes[2], hashes[3]); + + // row 0 (int 5) vs row 2 (int null) - different (value vs null) + assert_ne!(hashes[0], hashes[2]); + + // row 1 (str "foo") vs row 3 (str null) - different (value vs null) + assert_ne!(hashes[1], hashes[3]); + } + + #[test] + #[cfg(not(feature = "force_hash_collisions"))] + fn create_hashes_for_dense_union_arrays() { + // creates a dense union array with int and string types + // [67, "norm", 100, "macdonald", 67] + let int_array = Int32Array::from(vec![67, 100, 67]); + let str_array = StringArray::from(vec!["norm", "macdonald"]); + + let type_ids = vec![0, 1, 0, 1, 0].into(); + let offsets = vec![0, 0, 1, 1, 2].into(); + let children = vec![ + Arc::new(int_array) as ArrayRef, + Arc::new(str_array) as ArrayRef, + ]; + + let 
union_fields = [ + (0, Arc::new(Field::new("a", DataType::Int32, false))), + (1, Arc::new(Field::new("b", DataType::Utf8, false))), + ] + .into_iter() + .collect(); + + let array = + UnionArray::try_new(union_fields, type_ids, Some(offsets), children).unwrap(); + let array_ref = Arc::new(array) as ArrayRef; + + let random_state = RandomState::with_seed(0); + let mut hashes = vec![0; array_ref.len()]; + create_hashes(&[array_ref], &random_state, &mut hashes).unwrap(); + + // 67 vs "norm" + assert_ne!(hashes[0], hashes[1]); + // 67 vs 100 + assert_ne!(hashes[0], hashes[2]); + // "norm" vs "macdonald" + assert_ne!(hashes[1], hashes[3]); + // 100 vs "macdonald" + assert_ne!(hashes[2], hashes[3]); + // 67 vs 67 + assert_eq!(hashes[0], hashes[4]); + } + + #[test] + #[cfg(not(feature = "force_hash_collisions"))] + fn create_hashes_for_sliced_run_array() -> Result<()> { + let values = Arc::new(Int32Array::from(vec![10, 20, 30])); + let run_ends = Arc::new(Int32Array::from(vec![2, 5, 7])); + let array = Arc::new(RunArray::try_new(&run_ends, values.as_ref()).unwrap()); + + let random_state = RandomState::with_seed(0); + let mut full_hashes = vec![0; array.len()]; + create_hashes( + &[Arc::clone(&array) as ArrayRef], + &random_state, + &mut full_hashes, + )?; + + let array_ref: ArrayRef = Arc::clone(&array) as ArrayRef; + let sliced_array = array_ref.slice(2, 3); + + let mut sliced_hashes = vec![0; sliced_array.len()]; + create_hashes( + std::slice::from_ref(&sliced_array), + &random_state, + &mut sliced_hashes, + )?; + + assert_eq!(sliced_hashes.len(), 3); + assert_eq!(sliced_hashes[0], sliced_hashes[1]); + assert_eq!(sliced_hashes[1], sliced_hashes[2]); + assert_eq!(&sliced_hashes, &full_hashes[2..5]); + + Ok(()) + } + + #[test] + #[cfg(not(feature = "force_hash_collisions"))] + fn test_run_array_with_nulls() -> Result<()> { + let values = Arc::new(Int32Array::from(vec![Some(10), None, Some(20)])); + let run_ends = Arc::new(Int32Array::from(vec![2, 4, 6])); + let array = Arc::new(RunArray::try_new(&run_ends, values.as_ref()).unwrap()); + + let random_state = RandomState::with_seed(0); + let mut hashes = vec![0; array.len()]; + create_hashes( + &[Arc::clone(&array) as ArrayRef], + &random_state, + &mut hashes, + )?; + + assert_eq!(hashes[0], hashes[1]); + assert_ne!(hashes[0], 0); + assert_eq!(hashes[2], hashes[3]); + assert_eq!(hashes[2], 0); + assert_eq!(hashes[4], hashes[5]); + assert_ne!(hashes[4], 0); + assert_ne!(hashes[0], hashes[4]); + + Ok(()) + } + + #[test] + #[cfg(not(feature = "force_hash_collisions"))] + fn test_run_array_with_nulls_multicolumn() -> Result<()> { + let primitive_array = Arc::new(Int32Array::from(vec![Some(10), None, Some(20)])); + let run_values = Arc::new(Int32Array::from(vec![Some(10), None, Some(20)])); + let run_ends = Arc::new(Int32Array::from(vec![1, 2, 3])); + let run_array = + Arc::new(RunArray::try_new(&run_ends, run_values.as_ref()).unwrap()); + let second_col = Arc::new(Int32Array::from(vec![100, 200, 300])); + + let random_state = RandomState::with_seed(0); + + let mut primitive_hashes = vec![0; 3]; + create_hashes( + &[ + Arc::clone(&primitive_array) as ArrayRef, + Arc::clone(&second_col) as ArrayRef, + ], + &random_state, + &mut primitive_hashes, + )?; + + let mut run_hashes = vec![0; 3]; + create_hashes( + &[ + Arc::clone(&run_array) as ArrayRef, + Arc::clone(&second_col) as ArrayRef, + ], + &random_state, + &mut run_hashes, + )?; + + assert_eq!(primitive_hashes, run_hashes); + + Ok(()) + } } diff --git a/datafusion/common/src/instant.rs 
b/datafusion/common/src/instant.rs index 42f21c061c0c2..a5dfb28292581 100644 --- a/datafusion/common/src/instant.rs +++ b/datafusion/common/src/instant.rs @@ -22,7 +22,7 @@ /// under `wasm` feature gate. It provides the same API as [`std::time::Instant`]. pub type Instant = web_time::Instant; -#[allow(clippy::disallowed_types)] +#[expect(clippy::disallowed_types)] #[cfg(not(target_family = "wasm"))] /// DataFusion wrapper around [`std::time::Instant`]. This is only a type alias. pub type Instant = std::time::Instant; diff --git a/datafusion/common/src/join_type.rs b/datafusion/common/src/join_type.rs index e6a90db2dc3eb..d517844db48b4 100644 --- a/datafusion/common/src/join_type.rs +++ b/datafusion/common/src/join_type.rs @@ -97,6 +97,35 @@ impl JoinType { } } + /// Whether each side of the join is preserved for ON-clause filter pushdown. + /// + /// It is only correct to push ON-clause filters below a join for preserved + /// inputs. + /// + /// # "Preserved" input definition + /// + /// A join side is preserved if the join returns all or a subset of the rows + /// from that side, such that each output row directly maps to an input row. + /// If a side is not preserved, the join can produce extra null rows that + /// don't map to any input row. + /// + /// # Return Value + /// + /// A tuple of booleans - (left_preserved, right_preserved). + pub fn on_lr_is_preserved(&self) -> (bool, bool) { + match self { + JoinType::Inner => (true, true), + JoinType::Left => (false, true), + JoinType::Right => (true, false), + JoinType::Full => (false, false), + JoinType::LeftSemi | JoinType::RightSemi => (true, true), + JoinType::LeftAnti => (false, true), + JoinType::RightAnti => (true, false), + JoinType::LeftMark => (false, true), + JoinType::RightMark => (true, false), + } + } + /// Does the join type support swapping inputs? pub fn supports_swap(&self) -> bool { matches!( @@ -113,6 +142,20 @@ impl JoinType { | JoinType::RightMark ) } + + /// Returns true when an empty build side necessarily produces an empty + /// result for this join type.
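+    ///
+    /// # Example
+    ///
+    /// A minimal sketch of the contract (using the `JoinType` re-export from
+    /// this crate):
+    ///
+    /// ```
+    /// use datafusion_common::JoinType;
+    ///
+    /// // An empty build side means an inner join cannot emit any rows
+    /// assert!(JoinType::Inner.empty_build_side_produces_empty_result());
+    /// // A full join still emits probe-side rows padded with nulls
+    /// assert!(!JoinType::Full.empty_build_side_produces_empty_result());
+    /// ```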
+ pub fn empty_build_side_produces_empty_result(self) -> bool { + matches!( + self, + JoinType::Inner + | JoinType::Left + | JoinType::LeftSemi + | JoinType::LeftAnti + | JoinType::LeftMark + | JoinType::RightSemi + ) + } } impl Display for JoinType { diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index 76c7b46e32737..996c563f0d8a2 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -23,14 +23,13 @@ // Make sure fast / cheap clones on Arc are explicit: // https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] +#![cfg_attr(test, allow(clippy::needless_pass_by_value))] mod column; mod dfschema; mod functional_dependencies; mod join_type; mod param_value; -#[cfg(feature = "pyarrow")] -mod pyarrow; mod schema_reference; mod table_reference; mod unnest; @@ -51,6 +50,7 @@ pub mod instant; pub mod metadata; pub mod nested_struct; mod null_equality; +pub mod parquet_config; pub mod parsers; pub mod pruning; pub mod rounding; @@ -61,28 +61,30 @@ pub mod test_util; pub mod tree_node; pub mod types; pub mod utils; - /// Reexport arrow crate pub use arrow; pub use column::Column; pub use dfschema::{ - qualified_name, DFSchema, DFSchemaRef, ExprSchema, SchemaExt, ToDFSchema, + DFSchema, DFSchemaRef, ExprSchema, SchemaExt, ToDFSchema, qualified_name, }; pub use diagnostic::Diagnostic; +pub use display::human_readable::{ + human_readable_count, human_readable_duration, human_readable_size, units, +}; pub use error::{ - field_not_found, unqualified_field_not_found, DataFusionError, Result, SchemaError, - SharedResult, + DataFusionError, Result, SchemaError, SharedResult, field_not_found, + unqualified_field_not_found, }; pub use file_options::file_type::{ - GetExt, DEFAULT_ARROW_EXTENSION, DEFAULT_AVRO_EXTENSION, DEFAULT_CSV_EXTENSION, - DEFAULT_JSON_EXTENSION, DEFAULT_PARQUET_EXTENSION, + DEFAULT_ARROW_EXTENSION, DEFAULT_AVRO_EXTENSION, DEFAULT_CSV_EXTENSION, + DEFAULT_JSON_EXTENSION, DEFAULT_PARQUET_EXTENSION, GetExt, }; pub use functional_dependencies::{ + Constraint, Constraints, Dependency, FunctionalDependence, FunctionalDependencies, aggregate_functional_dependencies, get_required_group_by_exprs_indices, - get_target_functional_dependencies, Constraint, Constraints, Dependency, - FunctionalDependence, FunctionalDependencies, + get_required_sort_exprs_indices, get_target_functional_dependencies, }; -use hashbrown::hash_map::DefaultHashBuilder; +use hashbrown::DefaultHashBuilder; pub use join_type::{JoinConstraint, JoinSide, JoinType}; pub use nested_struct::cast_column; pub use null_equality::NullEquality; @@ -102,9 +104,9 @@ pub use utils::project_schema; // https://github.com/rust-lang/rust/pull/52234#issuecomment-976702997 #[doc(hidden)] pub use error::{ - _config_datafusion_err, _exec_datafusion_err, _internal_datafusion_err, - _not_impl_datafusion_err, _plan_datafusion_err, _resources_datafusion_err, - _substrait_datafusion_err, + _config_datafusion_err, _exec_datafusion_err, _ffi_datafusion_err, + _internal_datafusion_err, _not_impl_datafusion_err, _plan_datafusion_err, + _resources_datafusion_err, _substrait_datafusion_err, }; // The HashMap and HashSet implementations that should be used as the uniform defaults @@ -136,10 +138,10 @@ macro_rules! downcast_value { // Not public API. 
#[doc(hidden)] pub mod __private { - use crate::error::_internal_datafusion_err; use crate::Result; + use crate::error::_internal_datafusion_err; use arrow::array::Array; - use std::any::{type_name, Any}; + use std::any::{Any, type_name}; #[doc(hidden)] pub trait DowncastArrayHelper { @@ -190,7 +192,7 @@ mod tests { assert_starts_with( error.to_string(), - "Internal error: could not cast array of type Int32 to arrow_array::array::primitive_array::PrimitiveArray" + "Internal error: could not cast array of type Int32 to arrow_array::array::primitive_array::PrimitiveArray", ); } diff --git a/datafusion/common/src/metadata.rs b/datafusion/common/src/metadata.rs index 3a10cc2b42f9f..d6d8fb7b0ed0c 100644 --- a/datafusion/common/src/metadata.rs +++ b/datafusion/common/src/metadata.rs @@ -17,10 +17,10 @@ use std::{collections::BTreeMap, sync::Arc}; -use arrow::datatypes::{DataType, Field}; +use arrow::datatypes::{DataType, Field, FieldRef}; use hashbrown::HashMap; -use crate::{error::_plan_err, DataFusionError, ScalarValue}; +use crate::{DataFusionError, ScalarValue, error::_plan_err}; /// A [`ScalarValue`] with optional [`FieldMetadata`] #[derive(Debug, Clone)] @@ -171,6 +171,10 @@ pub fn format_type_and_metadata( /// // Add any metadata from `FieldMetadata` to `Field` /// let updated_field = metadata.add_to_field(field); /// ``` +/// +/// For more background, please also see the [Implementing User Defined Types and Custom Metadata in DataFusion blog] +/// +/// [Implementing User Defined Types and Custom Metadata in DataFusion blog]: https://datafusion.apache.org/blog/2025/09/21/custom-types-using-metadata #[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)] pub struct FieldMetadata { /// The inner metadata of a literal expression, which is a map of string @@ -320,6 +324,16 @@ impl FieldMetadata { field.with_metadata(self.to_hashmap()) } + + /// Updates the metadata on the FieldRef with this metadata, if it is not empty. + pub fn add_to_field_ref(&self, mut field_ref: FieldRef) -> FieldRef { + if self.inner.is_empty() { + return field_ref; + } + + Arc::make_mut(&mut field_ref).set_metadata(self.to_hashmap()); + field_ref + } } impl From<&Field> for FieldMetadata { diff --git a/datafusion/common/src/nested_struct.rs b/datafusion/common/src/nested_struct.rs index d43816f75b0ed..cdd6215d08e2f 100644 --- a/datafusion/common/src/nested_struct.rs +++ b/datafusion/common/src/nested_struct.rs @@ -15,13 +15,16 @@ // specific language governing permissions and limitations // under the License. -use crate::error::{Result, _plan_err}; +use crate::error::{_plan_err, Result}; use arrow::{ - array::{new_null_array, Array, ArrayRef, StructArray}, - compute::{cast_with_options, CastOptions}, - datatypes::{DataType::Struct, Field, FieldRef}, + array::{ + Array, ArrayRef, DictionaryArray, GenericListArray, GenericListViewArray, + StructArray, downcast_integer, new_null_array, + }, + compute::{CastOptions, can_cast_types, cast_with_options}, + datatypes::{DataType, DataType::Struct, Field, FieldRef}, }; -use std::sync::Arc; +use std::{collections::HashSet, sync::Arc}; /// Cast a struct column to match target struct fields, handling nested structs recursively. 
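+///
+/// For intuition, a small sketch of the matching rules described below
+/// (values are illustrative): casting `{a: 1, extra: "x"}` to
+/// `Struct {a: Int64, b: Utf8}` casts `a` to `Int64`, fills `b` with nulls,
+/// and ignores `extra`.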
/// @@ -31,6 +34,7 @@ use std::sync::Arc; /// /// ## Field Matching Strategy /// - **By Name**: Source struct fields are matched to target fields by name (case-sensitive) +/// - **No Positional Mapping**: Structs with no overlapping field names are rejected /// - **Type Adaptation**: When a matching field is found, it is recursively cast to the target field's type /// - **Missing Fields**: Target fields not present in the source are filled with null values /// - **Extra Fields**: Source fields not present in the target are ignored @@ -54,25 +58,42 @@ fn cast_struct_column( target_fields: &[Arc<Field>], cast_options: &CastOptions, ) -> Result<ArrayRef> { - if let Some(source_struct) = source_col.as_any().downcast_ref::<StructArray>() { - validate_struct_compatibility(source_struct.fields(), target_fields)?; + if source_col.data_type() == &DataType::Null + || (!source_col.is_empty() && source_col.null_count() == source_col.len()) + { + return Ok(new_null_array( + &Struct(target_fields.to_vec().into()), + source_col.len(), + )); + } + if let Some(source_struct) = source_col.as_any().downcast_ref::<StructArray>() { + let source_fields = source_struct.fields(); + validate_struct_compatibility(source_fields, target_fields)?; let mut fields: Vec<Arc<Field>> = Vec::with_capacity(target_fields.len()); let mut arrays: Vec<ArrayRef> = Vec::with_capacity(target_fields.len()); let num_rows = source_col.len(); - for target_child_field in target_fields { + // Iterate target fields and pick source child by name when present. + for target_child_field in target_fields.iter() { fields.push(Arc::clone(target_child_field)); - match source_struct.column_by_name(target_child_field.name()) { + + let source_child_opt = + source_struct.column_by_name(target_child_field.name()); + + match source_child_opt { Some(source_child_col) => { - let adapted_child = - cast_column(source_child_col, target_child_field, cast_options) - .map_err(|e| { - e.context(format!( - "While casting struct field '{}'", - target_child_field.name() - )) - })?; + let adapted_child = cast_column( + source_child_col, + target_child_field.data_type(), + cast_options, + ) + .map_err(|e| { + e.context(format!( + "While casting struct field '{}'", + target_child_field.name() + )) + })?; arrays.push(adapted_child); } None => { @@ -112,18 +133,17 @@ fn cast_struct_column( /// ``` /// use arrow::array::{ArrayRef, Int64Array}; /// use arrow::compute::CastOptions; -/// use arrow::datatypes::{DataType, Field}; +/// use arrow::datatypes::DataType; /// use datafusion_common::nested_struct::cast_column; /// use std::sync::Arc; /// /// let source: ArrayRef = Arc::new(Int64Array::from(vec![1, i64::MAX])); -/// let target = Field::new("ints", DataType::Int32, true); /// // Permit lossy conversions by producing NULL on overflow instead of erroring /// let options = CastOptions { /// safe: true, /// ..Default::default() /// }; -/// let result = cast_column(&source, &target, &options).unwrap(); +/// let result = cast_column(&source, &DataType::Int32, &options).unwrap(); /// assert!(result.is_null(1)); /// ``` /// @@ -136,7 +156,7 @@ fn cast_struct_column( /// /// # Arguments /// * `source_col` - The source array to cast -/// * `target_field` - The target field definition (including type and metadata) +/// * `target_type` - The target data type to cast to /// * `cast_options` - Options that govern strictness and formatting of the cast /// /// # Returns @@ -150,18 +170,139 @@ fn cast_struct_column( /// - Invalid data type combinations are encountered pub fn cast_column( source_col: &ArrayRef, - target_field: &Field, +
target_type: &DataType, cast_options: &CastOptions, ) -> Result<ArrayRef> { - match target_field.data_type() { - Struct(target_fields) => { + match (source_col.data_type(), target_type) { + (_, Struct(target_fields)) => { cast_struct_column(source_col, target_fields, cast_options) } - _ => Ok(cast_with_options( + (DataType::List(_), DataType::List(target_inner)) => { + cast_list_column::<i32>(source_col, target_inner, cast_options) + } + (DataType::LargeList(_), DataType::LargeList(target_inner)) => { + cast_list_column::<i64>(source_col, target_inner, cast_options) + } + (DataType::ListView(_), DataType::ListView(target_inner)) => { + cast_list_view_column::<i32>(source_col, target_inner, cast_options) + } + (DataType::LargeListView(_), DataType::LargeListView(target_inner)) => { + cast_list_view_column::<i64>(source_col, target_inner, cast_options) + } + ( + DataType::Dictionary(source_key_type, _), + DataType::Dictionary(target_key_type, target_value_type), + ) => cast_dictionary_column( source_col, - target_field.data_type(), + source_key_type, + target_key_type, + target_value_type, cast_options, - )?), + ), + _ => Ok(cast_with_options(source_col, target_type, cast_options)?), + } +} + +fn cast_list_column<O: arrow::array::OffsetSizeTrait>( + source_col: &ArrayRef, + target_inner_field: &FieldRef, + cast_options: &CastOptions, +) -> Result<ArrayRef> { + let source_list = source_col + .as_any() + .downcast_ref::<GenericListArray<O>>() + .ok_or_else(|| { + crate::error::DataFusionError::Plan(format!( + "Expected list array but got {}", + source_col.data_type() + )) + })?; + + let cast_values = cast_column( + source_list.values(), + target_inner_field.data_type(), + cast_options, + )?; + + let result = GenericListArray::<O>::new( + Arc::clone(target_inner_field), + source_list.offsets().clone(), + cast_values, + source_list.nulls().cloned(), + ); + Ok(Arc::new(result)) +} + +fn cast_list_view_column<O: arrow::array::OffsetSizeTrait>( + source_col: &ArrayRef, + target_inner_field: &FieldRef, + cast_options: &CastOptions, +) -> Result<ArrayRef> { + let source_list = source_col + .as_any() + .downcast_ref::<GenericListViewArray<O>>() + .ok_or_else(|| { + crate::error::DataFusionError::Plan(format!( + "Expected list view array but got {}", + source_col.data_type() + )) + })?; + + let cast_values = cast_column( + source_list.values(), + target_inner_field.data_type(), + cast_options, + )?; + + let result = GenericListViewArray::<O>::try_new( + Arc::clone(target_inner_field), + source_list.offsets().clone(), + source_list.sizes().clone(), + cast_values, + source_list.nulls().cloned(), + )?; + Ok(Arc::new(result)) +} + +fn cast_dictionary_column( + source_col: &ArrayRef, + source_key_type: &DataType, + target_key_type: &DataType, + target_value_type: &DataType, + cast_options: &CastOptions, + ) -> Result<ArrayRef> { + // Dispatch on source key type to access keys/values, then recursively + // cast values. Rebuild with the source key type first. + macro_rules! cast_dict_values { + ($t:ty) => {{ + let source_dict = source_col + .as_any() + .downcast_ref::<DictionaryArray<$t>>() + .expect("downcast must succeed"); + let cast_values = + cast_column(source_dict.values(), target_value_type, cast_options)?; + Ok(Arc::new(DictionaryArray::<$t>::new( + source_dict.keys().clone(), + cast_values, + )) as ArrayRef) + }}; + } + + let result: Result<ArrayRef> = downcast_integer! { + source_key_type => (cast_dict_values), + k => _plan_err!("Unsupported dictionary key type: {k}") + }; + let result = result?; + + // If key types differ, delegate key casting to Arrow.
+ if source_key_type != target_key_type { + let target_dict_type = DataType::Dictionary( + Box::new(target_key_type.clone()), + Box::new(target_value_type.clone()), + ); + Ok(cast_with_options(&result, &target_dict_type, cast_options)?) + } else { + Ok(result) } } @@ -200,10 +341,20 @@ pub fn cast_column( /// // Target: {a: binary} /// // Result: Err(...) - string cannot cast to binary /// ``` +/// pub fn validate_struct_compatibility( source_fields: &[FieldRef], target_fields: &[FieldRef], ) -> Result<()> { + let has_overlap = has_one_of_more_common_fields(source_fields, target_fields); + if !has_overlap { + return _plan_err!( + "Cannot cast struct with {} fields to {} fields because there is no field name overlap", + source_fields.len(), + target_fields.len() + ); + } + // Check compatibility for each target field for target_field in target_fields { // Look for matching field in source by name @@ -211,55 +362,156 @@ pub fn validate_struct_compatibility( .iter() .find(|f| f.name() == target_field.name()) { - // Ensure nullability is compatible. It is invalid to cast a nullable - // source field to a non-nullable target field as this may discard - // null values. - if source_field.is_nullable() && !target_field.is_nullable() { + validate_field_compatibility(source_field, target_field)?; + } else { + // Target field is missing from source + // If it's non-nullable, we cannot fill it with NULL + if !target_field.is_nullable() { return _plan_err!( - "Cannot cast nullable struct field '{}' to non-nullable field", + "Cannot cast struct: target field '{}' is non-nullable but missing from source. \ + Cannot fill with NULL.", target_field.name() ); } - // Check if the matching field types are compatible - match (source_field.data_type(), target_field.data_type()) { - // Recursively validate nested structs - (Struct(source_nested), Struct(target_nested)) => { - validate_struct_compatibility(source_nested, target_nested)?; - } - // For non-struct types, use the existing castability check - _ => { - if !arrow::compute::can_cast_types( - source_field.data_type(), - target_field.data_type(), - ) { - return _plan_err!( - "Cannot cast struct field '{}' from type {} to type {}", - target_field.name(), - source_field.data_type(), - target_field.data_type() - ); - } - } - } } - // Missing fields in source are OK - they'll be filled with nulls } // Extra fields in source are OK - they'll be ignored Ok(()) } +fn validate_field_compatibility( + source_field: &Field, + target_field: &Field, +) -> Result<()> { + if source_field.data_type() == &DataType::Null { + // Validate that target allows nulls before returning early. + // It is invalid to cast a NULL source field to a non-nullable target field. + if !target_field.is_nullable() { + return _plan_err!( + "Cannot cast NULL struct field '{}' to non-nullable field '{}'", + source_field.name(), + target_field.name() + ); + } + return Ok(()); + } + + // Ensure nullability is compatible. It is invalid to cast a nullable + // source field to a non-nullable target field as this may discard + // null values. + if source_field.is_nullable() && !target_field.is_nullable() { + return _plan_err!( + "Cannot cast nullable struct field '{}' to non-nullable field", + target_field.name() + ); + } + + validate_data_type_compatibility( + target_field.name(), + source_field.data_type(), + target_field.data_type(), + ) +} + +/// Validates that `source_type` can be cast to `target_type`, recursively +/// handling container types that wrap structs.
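+///
+/// # Example
+///
+/// A minimal sketch; `"c"` is just an illustrative field name used in
+/// error messages:
+///
+/// ```
+/// use arrow::datatypes::DataType;
+/// use datafusion_common::nested_struct::validate_data_type_compatibility;
+///
+/// // Int64 -> Int32 is an ordinary Arrow cast, so validation succeeds
+/// assert!(
+///     validate_data_type_compatibility("c", &DataType::Int64, &DataType::Int32).is_ok()
+/// );
+/// // Binary -> Int32 is not castable, so validation fails
+/// assert!(
+///     validate_data_type_compatibility("c", &DataType::Binary, &DataType::Int32).is_err()
+/// );
+/// ```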
+pub fn validate_data_type_compatibility( + field_name: &str, + source_type: &DataType, + target_type: &DataType, +) -> Result<()> { + match (source_type, target_type) { + (Struct(source_nested), Struct(target_nested)) => { + validate_struct_compatibility(source_nested, target_nested)?; + } + (DataType::List(s), DataType::List(t)) + | (DataType::LargeList(s), DataType::LargeList(t)) + | (DataType::ListView(s), DataType::ListView(t)) + | (DataType::LargeListView(s), DataType::LargeListView(t)) => { + validate_field_compatibility(s, t)?; + } + (DataType::Dictionary(s_key, s_val), DataType::Dictionary(t_key, t_val)) => { + if !can_cast_types(s_key, t_key) { + return _plan_err!( + "Cannot cast dictionary key type {} to {} for field '{}'", + s_key, + t_key, + field_name + ); + } + validate_data_type_compatibility(field_name, s_val, t_val)?; + } + _ => { + if !can_cast_types(source_type, target_type) { + return _plan_err!( + "Cannot cast struct field '{}' from type {} to type {}", + field_name, + source_type, + target_type + ); + } + } + } + Ok(()) +} + +/// Returns true if casting from `source_type` to `target_type` requires +/// name-based nested struct casting logic, rather than Arrow's standard cast. +/// +/// This is the case when both types are struct types, or both are the same +/// container type (List, LargeList, ListView, LargeListView, Dictionary) wrapping +/// types that recursively contain structs. +/// +/// Use this predicate at both planning time (to decide whether to apply struct +/// compatibility validation) and execution time (to decide whether to route +/// through [`cast_column`] instead of Arrow's generic cast). +pub fn requires_nested_struct_cast( + source_type: &DataType, + target_type: &DataType, +) -> bool { + match (source_type, target_type) { + (Struct(_), Struct(_)) => true, + (DataType::List(s), DataType::List(t)) + | (DataType::LargeList(s), DataType::LargeList(t)) + | (DataType::ListView(s), DataType::ListView(t)) + | (DataType::LargeListView(s), DataType::LargeListView(t)) => { + requires_nested_struct_cast(s.data_type(), t.data_type()) + } + (DataType::Dictionary(_, s_val), DataType::Dictionary(_, t_val)) => { + requires_nested_struct_cast(s_val, t_val) + } + _ => false, + } +} + +/// Check if two field lists have at least one common field by name. +/// +/// This is useful for validating struct compatibility when casting between structs, +/// ensuring that source and target fields have overlapping names. 
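+///
+/// # Example
+///
+/// A minimal sketch:
+///
+/// ```
+/// use std::sync::Arc;
+/// use arrow::datatypes::{DataType, Field};
+/// use datafusion_common::nested_struct::has_one_of_more_common_fields;
+///
+/// let source = vec![Arc::new(Field::new("a", DataType::Int32, true))];
+/// let target = vec![
+///     Arc::new(Field::new("a", DataType::Int64, true)),
+///     Arc::new(Field::new("b", DataType::Utf8, true)),
+/// ];
+/// // The shared name `a` is enough for by-name struct casting
+/// assert!(has_one_of_more_common_fields(&source, &target));
+/// ```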
+pub fn has_one_of_more_common_fields( + source_fields: &[FieldRef], + target_fields: &[FieldRef], +) -> bool { + let source_names: HashSet<&str> = source_fields + .iter() + .map(|field| field.name().as_str()) + .collect(); + target_fields + .iter() + .any(|field| source_names.contains(field.name().as_str())) +} + #[cfg(test)] mod tests { - use super::*; - use crate::format::DEFAULT_CAST_OPTIONS; + use crate::{assert_contains, format::DEFAULT_CAST_OPTIONS}; use arrow::{ array::{ - BinaryArray, Int32Array, Int32Builder, Int64Array, ListArray, MapArray, - MapBuilder, StringArray, StringBuilder, + BinaryArray, Int32Array, Int32Builder, Int64Array, ListArray, ListViewArray, + MapArray, MapBuilder, NullArray, StringArray, StringBuilder, }, - buffer::NullBuffer, + buffer::{NullBuffer, ScalarBuffer}, datatypes::{DataType, Field, FieldRef, Int32Type}, }; /// Macro to extract and downcast a column from a StructArray @@ -302,7 +554,9 @@ mod tests { fn test_cast_simple_column() { let source = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef; let target_field = field("ints", DataType::Int64); - let result = cast_column(&source, &target_field, &DEFAULT_CAST_OPTIONS).unwrap(); + let result = + cast_column(&source, target_field.data_type(), &DEFAULT_CAST_OPTIONS) + .unwrap(); let result = result.as_any().downcast_ref::().unwrap(); assert_eq!(result.len(), 3); assert_eq!(result.value(0), 1); @@ -320,14 +574,15 @@ mod tests { safe: false, ..DEFAULT_CAST_OPTIONS }; - assert!(cast_column(&source, &target_field, &safe_opts).is_err()); + assert!(cast_column(&source, target_field.data_type(), &safe_opts).is_err()); let unsafe_opts = CastOptions { // safe: true - return Null for failure safe: true, ..DEFAULT_CAST_OPTIONS }; - let result = cast_column(&source, &target_field, &unsafe_opts).unwrap(); + let result = + cast_column(&source, target_field.data_type(), &unsafe_opts).unwrap(); let result = result.as_any().downcast_ref::().unwrap(); assert_eq!(result.value(0), 1); assert!(result.is_null(1)); @@ -348,7 +603,8 @@ mod tests { ); let result = - cast_column(&source_col, &target_field, &DEFAULT_CAST_OPTIONS).unwrap(); + cast_column(&source_col, target_field.data_type(), &DEFAULT_CAST_OPTIONS) + .unwrap(); let struct_array = result.as_any().downcast_ref::().unwrap(); assert_eq!(struct_array.fields().len(), 2); let a_result = get_column_as!(&struct_array, "a", Int32Array); @@ -366,7 +622,8 @@ mod tests { let source = Arc::new(Int32Array::from(vec![10, 20])) as ArrayRef; let target_field = struct_field("s", vec![field("a", DataType::Int32)]); - let result = cast_column(&source, &target_field, &DEFAULT_CAST_OPTIONS); + let result = + cast_column(&source, target_field.data_type(), &DEFAULT_CAST_OPTIONS); assert!(result.is_err()); let error_msg = result.unwrap_err().to_string(); assert!(error_msg.contains("Cannot cast column of type")); @@ -386,7 +643,8 @@ mod tests { let target_field = struct_field("s", vec![field("a", DataType::Int32)]); - let result = cast_column(&source_col, &target_field, &DEFAULT_CAST_OPTIONS); + let result = + cast_column(&source_col, target_field.data_type(), &DEFAULT_CAST_OPTIONS); assert!(result.is_err()); let error_msg = result.unwrap_err().to_string(); assert!(error_msg.contains("Cannot cast struct field 'a'")); @@ -428,11 +686,14 @@ mod tests { #[test] fn test_validate_struct_compatibility_missing_field_in_source() { - // Source struct: {field2: String} (missing field1) - let source_fields = vec![arc_field("field2", DataType::Utf8)]; + // Source struct: {field1: Int32} (missing 
field2) + let source_fields = vec![arc_field("field1", DataType::Int32)]; - // Target struct: {field1: Int32} - let target_fields = vec![arc_field("field1", DataType::Int32)]; + // Target struct: {field1: Int32, field2: Utf8} + let target_fields = vec![ + arc_field("field1", DataType::Int32), + arc_field("field2", DataType::Utf8), + ]; // Should be OK - missing fields will be filled with nulls let result = validate_struct_compatibility(&source_fields, &target_fields); @@ -455,6 +716,20 @@ mod tests { assert!(result.is_ok()); } + #[test] + fn test_validate_struct_compatibility_no_overlap_mismatch_len() { + let source_fields = vec![ + arc_field("left", DataType::Int32), + arc_field("right", DataType::Int32), + ]; + let target_fields = vec![arc_field("alpha", DataType::Int32)]; + + let result = validate_struct_compatibility(&source_fields, &target_fields); + assert!(result.is_err()); + let error_msg = result.unwrap_err().to_string(); + assert_contains!(error_msg, "no field name overlap"); + } + #[test] fn test_cast_struct_parent_nulls_retained() { let a_array = Arc::new(Int32Array::from(vec![Some(1), Some(2)])) as ArrayRef; @@ -466,7 +741,8 @@ mod tests { let target_field = struct_field("s", vec![field("a", DataType::Int64)]); let result = - cast_column(&source_col, &target_field, &DEFAULT_CAST_OPTIONS).unwrap(); + cast_column(&source_col, target_field.data_type(), &DEFAULT_CAST_OPTIONS) + .unwrap(); let struct_array = result.as_any().downcast_ref::().unwrap(); assert_eq!(struct_array.null_count(), 1); assert!(struct_array.is_valid(0)); @@ -525,6 +801,117 @@ mod tests { assert!(error_msg.contains("non-nullable")); } + #[test] + fn test_validate_struct_compatibility_by_name() { + // Source struct: {field1: Int32, field2: String} + let source_fields = vec![ + arc_field("field1", DataType::Int32), + arc_field("field2", DataType::Utf8), + ]; + + // Target struct: {field2: String, field1: Int64} + let target_fields = vec![ + arc_field("field2", DataType::Utf8), + arc_field("field1", DataType::Int64), + ]; + + let result = validate_struct_compatibility(&source_fields, &target_fields); + assert!(result.is_ok()); + } + + #[test] + fn test_validate_struct_compatibility_by_name_with_type_mismatch() { + // Source struct: {field1: Binary} + let source_fields = vec![arc_field("field1", DataType::Binary)]; + + // Target struct: {field1: Int32} (incompatible type) + let target_fields = vec![arc_field("field1", DataType::Int32)]; + + let result = validate_struct_compatibility(&source_fields, &target_fields); + assert!(result.is_err()); + let error_msg = result.unwrap_err().to_string(); + assert_contains!( + error_msg, + "Cannot cast struct field 'field1' from type Binary to type Int32" + ); + } + + #[test] + fn test_validate_struct_compatibility_no_overlap_equal_len() { + let source_fields = vec![ + arc_field("left", DataType::Int32), + arc_field("right", DataType::Utf8), + ]; + + let target_fields = vec![ + arc_field("alpha", DataType::Int32), + arc_field("beta", DataType::Utf8), + ]; + + let result = validate_struct_compatibility(&source_fields, &target_fields); + assert!(result.is_err()); + let error_msg = result.unwrap_err().to_string(); + assert_contains!(error_msg, "no field name overlap"); + } + + #[test] + fn test_validate_struct_compatibility_mixed_name_overlap() { + // Source struct: {a: Int32, b: String, extra: Boolean} + let source_fields = vec![ + arc_field("a", DataType::Int32), + arc_field("b", DataType::Utf8), + arc_field("extra", DataType::Boolean), + ]; + + // Target struct: {b: String, a: 
Int64, c: Float32} + // Name overlap with a and b, missing c (nullable) + let target_fields = vec![ + arc_field("b", DataType::Utf8), + arc_field("a", DataType::Int64), + arc_field("c", DataType::Float32), + ]; + + let result = validate_struct_compatibility(&source_fields, &target_fields); + assert!(result.is_ok()); + } + + #[test] + fn test_validate_struct_compatibility_by_name_missing_required_field() { + // Source struct: {field1: Int32} (missing field2) + let source_fields = vec![arc_field("field1", DataType::Int32)]; + + // Target struct: {field1: Int32, field2: Int32 non-nullable} + let target_fields = vec![ + arc_field("field1", DataType::Int32), + Arc::new(non_null_field("field2", DataType::Int32)), + ]; + + let result = validate_struct_compatibility(&source_fields, &target_fields); + assert!(result.is_err()); + let error_msg = result.unwrap_err().to_string(); + assert_contains!( + error_msg, + "Cannot cast struct: target field 'field2' is non-nullable but missing from source. Cannot fill with NULL." + ); + } + + #[test] + fn test_validate_struct_compatibility_partial_name_overlap_with_count_mismatch() { + // Source struct: {a: Int32} (only one field) + let source_fields = vec![arc_field("a", DataType::Int32)]; + + // Target struct: {a: Int32, b: String} (two fields, but 'a' overlaps) + let target_fields = vec![ + arc_field("a", DataType::Int32), + arc_field("b", DataType::Utf8), + ]; + + // This should succeed - partial overlap means by-name mapping + // and missing field 'b' is nullable + let result = validate_struct_compatibility(&source_fields, &target_fields); + assert!(result.is_ok()); + } + #[test] fn test_cast_nested_struct_with_extra_and_missing_fields() { // Source inner struct has fields a, b, extra @@ -565,7 +952,8 @@ mod tests { ); let result = - cast_column(&source_col, &target_field, &DEFAULT_CAST_OPTIONS).unwrap(); + cast_column(&source_col, target_field.data_type(), &DEFAULT_CAST_OPTIONS) + .unwrap(); let outer = result.as_any().downcast_ref::().unwrap(); let inner = get_column_as!(&outer, "inner", StructArray); assert_eq!(inner.fields().len(), 3); @@ -585,6 +973,34 @@ mod tests { assert!(missing.is_null(1)); } + #[test] + fn test_cast_null_struct_field_to_nested_struct() { + let null_inner = Arc::new(NullArray::new(2)) as ArrayRef; + let source_struct = StructArray::from(vec![( + arc_field("inner", DataType::Null), + Arc::clone(&null_inner), + )]); + let source_col = Arc::new(source_struct) as ArrayRef; + + let target_field = struct_field( + "outer", + vec![struct_field("inner", vec![field("a", DataType::Int32)])], + ); + + let result = + cast_column(&source_col, target_field.data_type(), &DEFAULT_CAST_OPTIONS) + .unwrap(); + let outer = result.as_any().downcast_ref::().unwrap(); + let inner = get_column_as!(&outer, "inner", StructArray); + assert_eq!(inner.len(), 2); + assert!(inner.is_null(0)); + assert!(inner.is_null(1)); + + let inner_a = get_column_as!(inner, "a", Int32Array); + assert!(inner_a.is_null(0)); + assert!(inner_a.is_null(1)); + } + #[test] fn test_cast_struct_with_array_and_map_fields() { // Array field with second row null @@ -654,7 +1070,8 @@ mod tests { ); let result = - cast_column(&source_col, &target_field, &DEFAULT_CAST_OPTIONS).unwrap(); + cast_column(&source_col, target_field.data_type(), &DEFAULT_CAST_OPTIONS) + .unwrap(); let struct_array = result.as_any().downcast_ref::().unwrap(); let arr = get_column_as!(&struct_array, "arr", ListArray); @@ -693,7 +1110,8 @@ mod tests { ); let result = - cast_column(&source_col, &target_field, 
&DEFAULT_CAST_OPTIONS).unwrap(); + cast_column(&source_col, target_field.data_type(), &DEFAULT_CAST_OPTIONS) + .unwrap(); let struct_array = result.as_any().downcast_ref::().unwrap(); let b_col = get_column_as!(&struct_array, "b", Int64Array); @@ -704,4 +1122,218 @@ mod tests { assert_eq!(a_col.value(0), 1); assert_eq!(a_col.value(1), 2); } + + #[test] + fn test_cast_struct_no_overlap_rejected() { + let first = Arc::new(Int32Array::from(vec![Some(10), Some(20)])) as ArrayRef; + let second = + Arc::new(StringArray::from(vec![Some("alpha"), Some("beta")])) as ArrayRef; + + let source_struct = StructArray::from(vec![ + (arc_field("left", DataType::Int32), first), + (arc_field("right", DataType::Utf8), second), + ]); + let source_col = Arc::new(source_struct) as ArrayRef; + + let target_field = struct_field( + "s", + vec![field("a", DataType::Int64), field("b", DataType::Utf8)], + ); + + let result = + cast_column(&source_col, target_field.data_type(), &DEFAULT_CAST_OPTIONS); + assert!(result.is_err()); + let error_msg = result.unwrap_err().to_string(); + assert_contains!(error_msg, "no field name overlap"); + } + + #[test] + fn test_cast_struct_missing_non_nullable_field_fails() { + // Source has only field 'a' + let a = Arc::new(Int32Array::from(vec![Some(1), Some(2)])) as ArrayRef; + let source_struct = StructArray::from(vec![(arc_field("a", DataType::Int32), a)]); + let source_col = Arc::new(source_struct) as ArrayRef; + + // Target has fields 'a' (nullable) and 'b' (non-nullable) + let target_field = struct_field( + "s", + vec![ + field("a", DataType::Int32), + non_null_field("b", DataType::Int32), + ], + ); + + // Should fail because 'b' is non-nullable but missing from source + let result = + cast_column(&source_col, target_field.data_type(), &DEFAULT_CAST_OPTIONS); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!( + err.to_string() + .contains("target field 'b' is non-nullable but missing from source"), + "Unexpected error: {err}" + ); + } + + #[test] + fn test_cast_struct_missing_nullable_field_succeeds() { + // Source has only field 'a' + let a = Arc::new(Int32Array::from(vec![Some(1), Some(2)])) as ArrayRef; + let source_struct = StructArray::from(vec![(arc_field("a", DataType::Int32), a)]); + let source_col = Arc::new(source_struct) as ArrayRef; + + // Target has fields 'a' and 'b' (both nullable) + let target_field = struct_field( + "s", + vec![field("a", DataType::Int32), field("b", DataType::Int32)], + ); + + // Should succeed - 'b' is nullable so can be filled with NULL + let result = + cast_column(&source_col, target_field.data_type(), &DEFAULT_CAST_OPTIONS) + .unwrap(); + let struct_array = result.as_any().downcast_ref::().unwrap(); + + let a_col = get_column_as!(&struct_array, "a", Int32Array); + assert_eq!(a_col.value(0), 1); + assert_eq!(a_col.value(1), 2); + + let b_col = get_column_as!(&struct_array, "b", Int32Array); + assert!(b_col.is_null(0)); + assert!(b_col.is_null(1)); + } + + #[test] + fn test_validate_dictionary_value_evolution() { + let source_inner = struct_type(vec![field("a", DataType::Int32)]); + let target_inner = struct_type(vec![ + field("a", DataType::Int32), + field("b", DataType::Utf8), + ]); + let source = + DataType::Dictionary(Box::new(DataType::Int32), Box::new(source_inner)); + let target = + DataType::Dictionary(Box::new(DataType::Int32), Box::new(target_inner)); + assert!(validate_data_type_compatibility("col", &source, &target).is_ok()); + } + + #[test] + fn test_cast_dictionary_struct_value() { + // Build a Dictionary and 
cast to + // Dictionary (field added, type widened). + let struct_arr = StructArray::from(vec![( + arc_field("a", DataType::Int32), + Arc::new(Int32Array::from(vec![10, 20])) as ArrayRef, + )]); + // keys: [0, null, 1] mapping into the 2-element struct values array. + let keys = Int32Array::from(vec![Some(0), None, Some(1)]); + let source_dict = DictionaryArray::::new(keys, Arc::new(struct_arr)); + let source_col: ArrayRef = Arc::new(source_dict); + + let target_type = DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(struct_type(vec![ + field("a", DataType::Int64), + field("b", DataType::Utf8), + ])), + ); + + let result = + cast_column(&source_col, &target_type, &DEFAULT_CAST_OPTIONS).unwrap(); + let result_dict = result + .as_any() + .downcast_ref::>() + .unwrap(); + + assert!(result_dict.is_valid(0)); + assert!(result_dict.is_null(1)); + assert!(result_dict.is_valid(2)); + + let struct_values = result_dict + .values() + .as_any() + .downcast_ref::() + .unwrap(); + let a_col = get_column_as!(&struct_values, "a", Int64Array); + assert_eq!(a_col.values(), &[10, 20]); + let b_col = get_column_as!(&struct_values, "b", StringArray); + assert!(b_col.iter().all(|v| v.is_none())); + } + + #[test] + fn test_cast_list_view_struct() { + // Build a ListView and cast to + // ListView. + let struct_arr = StructArray::from(vec![( + arc_field("a", DataType::Int32), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + )]); + + let source_field = + arc_field("item", struct_type(vec![field("a", DataType::Int32)])); + let target_field = arc_field( + "item", + struct_type(vec![ + field("a", DataType::Int64), + field("b", DataType::Utf8), + ]), + ); + + // Two list-view entries: [0..2] and [2..3] + let list_view = ListViewArray::new( + source_field, + ScalarBuffer::from(vec![0i32, 2]), + ScalarBuffer::from(vec![2i32, 1]), + Arc::new(struct_arr), + None, + ); + let source_col: ArrayRef = Arc::new(list_view); + + let target_type = DataType::ListView(target_field); + + let result = + cast_column(&source_col, &target_type, &DEFAULT_CAST_OPTIONS).unwrap(); + let result_lv = result.as_any().downcast_ref::().unwrap(); + assert_eq!(result_lv.len(), 2); + + let struct_values = result_lv + .values() + .as_any() + .downcast_ref::() + .unwrap(); + let a_col = get_column_as!(&struct_values, "a", Int64Array); + assert_eq!(a_col.values(), &[1, 2, 3]); + let b_col = get_column_as!(&struct_values, "b", StringArray); + assert!(b_col.iter().all(|v| v.is_none())); + } + + #[test] + fn test_requires_nested_struct_cast() { + let s1 = struct_type(vec![field("a", DataType::Int32)]); + let s2 = struct_type(vec![field("a", DataType::Int64)]); + + assert!(requires_nested_struct_cast(&s1, &s2)); + assert!(requires_nested_struct_cast( + &DataType::List(arc_field("item", s1.clone())), + &DataType::List(arc_field("item", s2.clone())), + )); + assert!(requires_nested_struct_cast( + &DataType::Dictionary(Box::new(DataType::Int32), Box::new(s1.clone())), + &DataType::Dictionary(Box::new(DataType::Int32), Box::new(s2.clone())), + )); + assert!(requires_nested_struct_cast( + &DataType::ListView(arc_field("item", s1)), + &DataType::ListView(arc_field("item", s2)), + )); + + // Non-struct types should return false. 
+ assert!(!requires_nested_struct_cast( + &DataType::Int32, + &DataType::Int64 + )); + assert!(!requires_nested_struct_cast( + &DataType::List(arc_field("item", DataType::Int32)), + &DataType::List(arc_field("item", DataType::Int64)), + )); + } } diff --git a/datafusion/common/src/param_value.rs b/datafusion/common/src/param_value.rs index ebf68e4dd210d..0fac6b529eb0f 100644 --- a/datafusion/common/src/param_value.rs +++ b/datafusion/common/src/param_value.rs @@ -16,7 +16,7 @@ // under the License. use crate::error::{_plan_datafusion_err, _plan_err}; -use crate::metadata::{check_metadata_with_storage_equal, ScalarAndMetadata}; +use crate::metadata::{ScalarAndMetadata, check_metadata_with_storage_equal}; use crate::{Result, ScalarValue}; use arrow::datatypes::{DataType, Field, FieldRef}; use std::collections::HashMap; diff --git a/datafusion/common/src/parquet_config.rs b/datafusion/common/src/parquet_config.rs new file mode 100644 index 0000000000000..9d6d7a88566a7 --- /dev/null +++ b/datafusion/common/src/parquet_config.rs @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::fmt::{self, Display}; +use std::str::FromStr; + +use crate::config::{ConfigField, Visit}; +use crate::error::{DataFusionError, Result}; + +/// Parquet writer version options for controlling the Parquet file format version +/// +/// This enum validates parquet writer version values at configuration time, +/// ensuring only valid versions ("1.0" or "2.0") can be set via `SET` commands +/// or proto deserialization. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum DFParquetWriterVersion { + /// Parquet format version 1.0 + #[default] + V1_0, + /// Parquet format version 2.0 + V2_0, +} + +/// Implement parsing strings to `DFParquetWriterVersion` +impl FromStr for DFParquetWriterVersion { + type Err = DataFusionError; + + fn from_str(s: &str) -> Result<Self> { + match s.to_lowercase().as_str() { + "1.0" => Ok(DFParquetWriterVersion::V1_0), + "2.0" => Ok(DFParquetWriterVersion::V2_0), + other => Err(DataFusionError::Configuration(format!( + "Invalid parquet writer version: {other}.
Expected one of: 1.0, 2.0" + ))), + } + } +} + +impl Display for DFParquetWriterVersion { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let s = match self { + DFParquetWriterVersion::V1_0 => "1.0", + DFParquetWriterVersion::V2_0 => "2.0", + }; + write!(f, "{s}") + } +} + +impl ConfigField for DFParquetWriterVersion { + fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) { + v.some(key, self, description) + } + + fn set(&mut self, _: &str, value: &str) -> Result<()> { + *self = DFParquetWriterVersion::from_str(value)?; + Ok(()) + } +} + +/// Convert `DFParquetWriterVersion` to parquet crate's `WriterVersion` +/// +/// This conversion is infallible since `DFParquetWriterVersion` only contains +/// valid values that have been validated at configuration time. +#[cfg(feature = "parquet")] +impl From<DFParquetWriterVersion> for parquet::file::properties::WriterVersion { + fn from(value: DFParquetWriterVersion) -> Self { + match value { + DFParquetWriterVersion::V1_0 => { + parquet::file::properties::WriterVersion::PARQUET_1_0 + } + DFParquetWriterVersion::V2_0 => { + parquet::file::properties::WriterVersion::PARQUET_2_0 + } + } + } +} + +/// Convert parquet crate's `WriterVersion` to `DFParquetWriterVersion` +/// +/// This is used when converting from existing parquet writer properties, +/// such as when reading from proto or test code. +#[cfg(feature = "parquet")] +impl From<parquet::file::properties::WriterVersion> for DFParquetWriterVersion { + fn from(version: parquet::file::properties::WriterVersion) -> Self { + match version { + parquet::file::properties::WriterVersion::PARQUET_1_0 => { + DFParquetWriterVersion::V1_0 + } + parquet::file::properties::WriterVersion::PARQUET_2_0 => { + DFParquetWriterVersion::V2_0 + } + } + } +} diff --git a/datafusion/common/src/parsers.rs b/datafusion/common/src/parsers.rs index cd3d607dacd88..6b930d110f47b 100644 --- a/datafusion/common/src/parsers.rs +++ b/datafusion/common/src/parsers.rs @@ -73,3 +73,59 @@ impl CompressionTypeVariant { !matches!(self, &Self::UNCOMPRESSED) } } + +/// CSV quote style +/// +/// Controls when fields are quoted when writing CSV files. +/// Corresponds to [`arrow::csv::QuoteStyle`].
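+///
+/// # Example
+///
+/// A minimal sketch; parsing via `FromStr` is case-insensitive:
+///
+/// ```
+/// use std::str::FromStr;
+/// use datafusion_common::parsers::CsvQuoteStyle;
+///
+/// assert_eq!(
+///     CsvQuoteStyle::from_str("NECESSARY").unwrap(),
+///     CsvQuoteStyle::Necessary
+/// );
+/// assert!(CsvQuoteStyle::from_str("sometimes").is_err());
+/// ```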
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)] +pub enum CsvQuoteStyle { + /// Quote all fields + Always, + /// Only quote fields when necessary (default) + #[default] + Necessary, + /// Quote all non-numeric fields + NonNumeric, + /// Never quote fields + Never, +} + +impl FromStr for CsvQuoteStyle { + type Err = DataFusionError; + + fn from_str(s: &str) -> Result<Self> { + match s.to_lowercase().as_str() { + "always" => Ok(Self::Always), + "necessary" => Ok(Self::Necessary), + "non_numeric" | "nonnumeric" => Ok(Self::NonNumeric), + "never" => Ok(Self::Never), + _ => Err(DataFusionError::NotImplemented(format!( + "Unsupported CSV quote style {s}" + ))), + } + } +} + +impl From<CsvQuoteStyle> for arrow::csv::QuoteStyle { + fn from(style: CsvQuoteStyle) -> Self { + match style { + CsvQuoteStyle::Always => Self::Always, + CsvQuoteStyle::NonNumeric => Self::NonNumeric, + CsvQuoteStyle::Never => Self::Never, + CsvQuoteStyle::Necessary => Self::Necessary, + } + } +} + +impl Display for CsvQuoteStyle { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let str = match self { + Self::Always => "Always", + Self::Necessary => "Necessary", + Self::NonNumeric => "NonNumeric", + Self::Never => "Never", + }; + write!(f, "{str}") + } +} diff --git a/datafusion/common/src/pruning.rs b/datafusion/common/src/pruning.rs index 48750e3c995c4..ebae23f0723a1 100644 --- a/datafusion/common/src/pruning.rs +++ b/datafusion/common/src/pruning.rs @@ -95,15 +95,17 @@ pub trait PruningStatistics { /// [`UInt64Array`]: arrow::array::UInt64Array fn null_counts(&self, column: &Column) -> Option<ArrayRef>; - /// Return the number of rows for the named column in each container - /// as an [`UInt64Array`]. + /// Return the number of rows in each container as an [`UInt64Array`]. + /// + /// Row counts are container-level (not column-specific) — the value + /// is the same regardless of which column is being considered. /// /// See [`Self::min_values`] for when to return `None` and null values. /// /// Note: the returned array must contain [`Self::num_containers`] rows /// /// [`UInt64Array`]: arrow::array::UInt64Array - fn row_counts(&self, column: &Column) -> Option<ArrayRef>; + fn row_counts(&self) -> Option<ArrayRef>; /// Returns [`BooleanArray`] where each row represents information known /// about specific literal `values` in a column. @@ -121,6 +123,7 @@ pub trait PruningStatistics { /// container, return `None` (the default). /// /// Note: the returned array must contain [`Self::num_containers`] rows + #[allow(clippy::allow_attributes, clippy::mutable_key_type)] // ScalarValue has interior mutability but is intentionally used as hash key fn contained( &self, column: &Column, @@ -135,6 +138,10 @@ pub trait PruningStatistics { /// This feeds into [`CompositePruningStatistics`] to allow pruning /// with filters that depend both on partition columns and data columns /// (e.g. `WHERE partition_col = data_col`). +#[deprecated( + since = "52.0.0", + note = "This struct is no longer used internally. Use `replace_columns_with_literals` from `datafusion-physical-expr-adapter` to substitute partition column values before pruning. It will be removed in 58.0.0 or 6 months after 52.0.0 is released, whichever comes first." +)] #[derive(Clone)] pub struct PartitionPruningStatistics { /// Values for each column for each container. @@ -156,6 +163,7 @@ pub struct PartitionPruningStatistics { partition_schema: SchemaRef, } +#[expect(deprecated)] impl PartitionPruningStatistics { /// Create a new instance of [`PartitionPruningStatistics`].
/// @@ -169,6 +177,36 @@ impl PartitionPruningStatistics { /// This must **not** be the schema of the entire file or table: /// instead it must only be the schema of the partition columns, /// in the same order as the values in `partition_values`. + /// + /// # Example + /// + /// To create [`PartitionPruningStatistics`] for two partition columns `a` and `b`, + /// for three containers like this: + /// + /// | a | b | + /// | - | - | + /// | 1 | 2 | + /// | 3 | 4 | + /// | 5 | 6 | + /// + /// ``` + /// # use std::sync::Arc; + /// # use datafusion_common::ScalarValue; + /// # use arrow::datatypes::{DataType, Field}; + /// # use datafusion_common::pruning::PartitionPruningStatistics; + /// + /// let partition_values = vec![ + /// vec![ScalarValue::from(1i32), ScalarValue::from(2i32)], + /// vec![ScalarValue::from(3i32), ScalarValue::from(4i32)], + /// vec![ScalarValue::from(5i32), ScalarValue::from(6i32)], + /// ]; + /// let partition_fields = vec![ + /// Arc::new(Field::new("a", DataType::Int32, false)), + /// Arc::new(Field::new("b", DataType::Int32, false)), + /// ]; + /// let partition_stats = + /// PartitionPruningStatistics::try_new(partition_values, partition_fields).unwrap(); + /// ``` pub fn try_new( partition_values: Vec<Vec<ScalarValue>>, partition_fields: Vec<FieldRef>, @@ -202,6 +240,7 @@ impl PartitionPruningStatistics { } } +#[expect(deprecated)] impl PruningStatistics for PartitionPruningStatistics { fn min_values(&self, column: &Column) -> Option<ArrayRef> { let index = self.partition_schema.index_of(column.name()).ok()?; @@ -228,7 +267,7 @@ impl PruningStatistics for PartitionPruningStatistics { None } - fn row_counts(&self, _column: &Column) -> Option<ArrayRef> { + fn row_counts(&self) -> Option<ArrayRef> { None } @@ -245,7 +284,7 @@ impl PruningStatistics for PartitionPruningStatistics { match acc { None => Some(Some(eq_result)), Some(acc_array) => { - arrow::compute::kernels::boolean::and(&acc_array, &eq_result) + arrow::compute::kernels::boolean::or_kleene(&acc_array, &eq_result) .map(Some) .ok() } @@ -361,11 +400,7 @@ impl PruningStatistics for PrunableStatistics { } } - fn row_counts(&self, column: &Column) -> Option<ArrayRef> { - // If the column does not exist in the schema, return None - if self.schema.index_of(column.name()).is_err() { - return None; - } + fn row_counts(&self) -> Option<ArrayRef> { if self .statistics .iter() @@ -409,10 +444,15 @@ impl PruningStatistics for PrunableStatistics { /// the first one is returned without any regard for completeness or accuracy. /// That is: if the first statistics has information for a column, even if it is incomplete, /// that is returned even if a later statistics has more complete information. +#[deprecated( + since = "52.0.0", + note = "This struct is no longer used internally. It may be removed in 58.0.0 or 6 months after 52.0.0 is released, whichever comes first. Please open an issue if you have a use case for it." +)] pub struct CompositePruningStatistics { pub statistics: Vec<Box<dyn PruningStatistics>>, } +#[expect(deprecated)] impl CompositePruningStatistics { /// Create a new instance of [`CompositePruningStatistics`] from /// a vector of [`PruningStatistics`].
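+    ///
+    /// A minimal sketch of construction; `partition_stats` and `file_stats`
+    /// are placeholders for any [`PruningStatistics`] implementations (the
+    /// API itself is deprecated, shown only for reference):
+    ///
+    /// ```ignore
+    /// let composite = CompositePruningStatistics::new(vec![
+    ///     Box::new(partition_stats) as Box<dyn PruningStatistics>,
+    ///     Box::new(file_stats),
+    /// ]);
+    /// ```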
@@ -427,6 +467,7 @@ impl CompositePruningStatistics { } } +#[expect(deprecated)] impl PruningStatistics for CompositePruningStatistics { fn min_values(&self, column: &Column) -> Option<ArrayRef> { for stats in &self.statistics { @@ -459,9 +500,9 @@ impl PruningStatistics for CompositePruningStatistics { None } - fn row_counts(&self, column: &Column) -> Option<ArrayRef> { + fn row_counts(&self) -> Option<ArrayRef> { for stats in &self.statistics { - if let Some(array) = stats.row_counts(column) { + if let Some(array) = stats.row_counts() { return Some(array); } } @@ -483,18 +524,26 @@ impl PruningStatistics for CompositePruningStatistics { } #[cfg(test)] +#[expect(deprecated)] +#[allow(clippy::allow_attributes, clippy::mutable_key_type)] // ScalarValue has interior mutability but is intentionally used as hash key mod tests { use crate::{ - cast::{as_int32_array, as_uint64_array}, ColumnStatistics, + cast::{as_int32_array, as_uint64_array}, }; use super::*; use arrow::datatypes::{DataType, Field}; use std::sync::Arc; - #[test] - fn test_partition_pruning_statistics() { + /// return a PartitionPruningStatistics for two columns 'a' and 'b' + /// and the following stats + /// + /// | a | b | + /// | - | - | + /// | 1 | 2 | + /// | 3 | 4 | fn partition_pruning_statistics_setup() -> PartitionPruningStatistics { let partition_values = vec![ vec![ScalarValue::from(1i32), ScalarValue::from(2i32)], vec![ScalarValue::from(3i32), ScalarValue::from(4i32)], @@ -503,18 +552,21 @@ mod tests { Arc::new(Field::new("a", DataType::Int32, false)), Arc::new(Field::new("b", DataType::Int32, false)), ]; - let partition_stats = - PartitionPruningStatistics::try_new(partition_values, partition_fields) - .unwrap(); + PartitionPruningStatistics::try_new(partition_values, partition_fields).unwrap() + } + + #[test] + fn test_partition_pruning_statistics() { + let partition_stats = partition_pruning_statistics_setup(); let column_a = Column::new_unqualified("a"); let column_b = Column::new_unqualified("b"); // Partition values don't know anything about nulls or row counts assert!(partition_stats.null_counts(&column_a).is_none()); - assert!(partition_stats.row_counts(&column_a).is_none()); + assert!(partition_stats.row_counts().is_none()); assert!(partition_stats.null_counts(&column_b).is_none()); - assert!(partition_stats.row_counts(&column_b).is_none()); + assert!(partition_stats.row_counts().is_none()); // Min/max values are the same as the partition values let min_values_a = @@ -560,6 +612,85 @@ mod tests { assert_eq!(partition_stats.num_containers(), 2); } + #[test] + fn test_partition_pruning_statistics_multiple_positive_values() { + let partition_stats = partition_pruning_statistics_setup(); + + let column_a = Column::new_unqualified("a"); + + // The two containers have `a` values 1 and 3, so they both only contain values from 1 and 3 + let values = HashSet::from([ScalarValue::from(1i32), ScalarValue::from(3i32)]); + let contained_a = partition_stats.contained(&column_a, &values).unwrap(); + let expected_contained_a = BooleanArray::from(vec![true, true]); + assert_eq!(contained_a, expected_contained_a); + } + + #[test] + fn test_partition_pruning_statistics_multiple_negative_values() { + let partition_stats = partition_pruning_statistics_setup(); + + let column_a = Column::new_unqualified("a"); + + // The two containers have `a` values 1 and 3, + // so the first contains ONLY values from 1,2 + // but the second does not + let values = HashSet::from([ScalarValue::from(1i32), ScalarValue::from(2i32)]); + let contained_a =
+            partition_stats.contained(&column_a, &values).unwrap();
+        let expected_contained_a = BooleanArray::from(vec![true, false]);
+        assert_eq!(contained_a, expected_contained_a);
+    }
+
+    #[test]
+    fn test_partition_pruning_statistics_null_in_values() {
+        let partition_values = vec![
+            vec![
+                ScalarValue::from(1i32),
+                ScalarValue::from(2i32),
+                ScalarValue::from(3i32),
+            ],
+            vec![
+                ScalarValue::from(4i32),
+                ScalarValue::from(5i32),
+                ScalarValue::from(6i32),
+            ],
+        ];
+        let partition_fields = vec![
+            Arc::new(Field::new("a", DataType::Int32, false)),
+            Arc::new(Field::new("b", DataType::Int32, false)),
+            Arc::new(Field::new("c", DataType::Int32, false)),
+        ];
+        let partition_stats =
+            PartitionPruningStatistics::try_new(partition_values, partition_fields)
+                .unwrap();
+
+        let column_a = Column::new_unqualified("a");
+        let column_b = Column::new_unqualified("b");
+        let column_c = Column::new_unqualified("c");
+
+        let values_a = HashSet::from([ScalarValue::from(1i32), ScalarValue::Int32(None)]);
+        let contained_a = partition_stats.contained(&column_a, &values_a).unwrap();
+        let mut builder = BooleanArray::builder(2);
+        builder.append_value(true);
+        builder.append_null();
+        let expected_contained_a = builder.finish();
+        assert_eq!(contained_a, expected_contained_a);
+
+        // The first match produces a NULL boolean entry; the accumulator
+        // should update it to true when the second value matches
+        let values_b = HashSet::from([ScalarValue::Int32(None), ScalarValue::from(5i32)]);
+        let contained_b = partition_stats.contained(&column_b, &values_b).unwrap();
+        let mut builder = BooleanArray::builder(2);
+        builder.append_null();
+        builder.append_value(true);
+        let expected_contained_b = builder.finish();
+        assert_eq!(contained_b, expected_contained_b);
+
+        // All matches are null, so `contained` returns None
+        let values_c = HashSet::from([ScalarValue::Int32(None)]);
+        let contained_c = partition_stats.contained(&column_c, &values_c);
+        assert!(contained_c.is_none());
+    }
+
     #[test]
     fn test_partition_pruning_statistics_empty() {
         let partition_values = vec![];
@@ -576,9 +707,9 @@ mod tests {
 
         // Partition values don't know anything about nulls or row counts
         assert!(partition_stats.null_counts(&column_a).is_none());
-        assert!(partition_stats.row_counts(&column_a).is_none());
+        assert!(partition_stats.row_counts().is_none());
         assert!(partition_stats.null_counts(&column_b).is_none());
-        assert!(partition_stats.row_counts(&column_b).is_none());
+        assert!(partition_stats.row_counts().is_none());
 
         // Min/max values are all missing
         assert!(partition_stats.min_values(&column_a).is_none());
@@ -681,13 +812,13 @@ mod tests {
         assert_eq!(null_counts_b, expected_null_counts_b);
 
         // Row counts are the same as the statistics
-        let row_counts_a = as_uint64_array(&pruning_stats.row_counts(&column_a).unwrap())
+        let row_counts_a = as_uint64_array(&pruning_stats.row_counts().unwrap())
             .unwrap()
            .into_iter()
            .collect::<Vec<_>>();
         let expected_row_counts_a = vec![Some(100), Some(200)];
         assert_eq!(row_counts_a, expected_row_counts_a);
-        let row_counts_b = as_uint64_array(&pruning_stats.row_counts(&column_b).unwrap())
+        let row_counts_b = as_uint64_array(&pruning_stats.row_counts().unwrap())
             .unwrap()
             .into_iter()
             .collect::<Vec<_>>();
@@ -712,7 +843,7 @@ mod tests {
-        // This is debatable, personally I think `row_count` should not take a `Column` as an argument
-        // at all since all columns should have the same number of rows.
-        // But for now we just document the current behavior in this test.
+        // `row_counts` intentionally no longer takes a `Column` argument: all
+        // columns share the same number of rows, so it reports container-level
+        // counts regardless of which column the caller is interested in.
-        let row_counts_c = as_uint64_array(&pruning_stats.row_counts(&column_c).unwrap())
+        let row_counts_c = as_uint64_array(&pruning_stats.row_counts().unwrap())
             .unwrap()
             .into_iter()
             .collect::<Vec<_>>();
@@ -720,12 +851,13 @@ mod tests {
         assert_eq!(row_counts_c, expected_row_counts_c);
         assert!(pruning_stats.contained(&column_c, &values).is_none());
 
-        // Test with a column that doesn't exist
+        // Test with a column that doesn't exist: column-specific stats
+        // return None, but row_counts is container-level and still available
         let column_d = Column::new_unqualified("d");
         assert!(pruning_stats.min_values(&column_d).is_none());
         assert!(pruning_stats.max_values(&column_d).is_none());
         assert!(pruning_stats.null_counts(&column_d).is_none());
-        assert!(pruning_stats.row_counts(&column_d).is_none());
+        assert!(pruning_stats.row_counts().is_some());
         assert!(pruning_stats.contained(&column_d, &values).is_none());
     }
 
@@ -753,8 +885,8 @@ mod tests {
         assert!(pruning_stats.null_counts(&column_b).is_none());
 
         // Row counts are all missing
-        assert!(pruning_stats.row_counts(&column_a).is_none());
-        assert!(pruning_stats.row_counts(&column_b).is_none());
+        assert!(pruning_stats.row_counts().is_none());
 
         // Contained values are all empty
         let values = HashSet::from([ScalarValue::from(1i32)]);
@@ -894,13 +1026,11 @@ mod tests {
         let expected_null_counts_col_x = vec![Some(0), Some(10)];
         assert_eq!(null_counts_col_x, expected_null_counts_col_x);
 
-        // Test row counts - only available from file statistics
-        assert!(composite_stats.row_counts(&part_a).is_none());
-        let row_counts_col_x =
-            as_uint64_array(&composite_stats.row_counts(&col_x).unwrap())
-                .unwrap()
-                .into_iter()
-                .collect::<Vec<_>>();
+        // Test row counts: container-level, available from file statistics
+        let row_counts_col_x = as_uint64_array(&composite_stats.row_counts().unwrap())
+            .unwrap()
+            .into_iter()
+            .collect::<Vec<_>>();
         let expected_row_counts = vec![Some(100), Some(200)];
         assert_eq!(row_counts_col_x, expected_row_counts);
 
@@ -913,12 +1043,13 @@ mod tests {
         // File statistics don't implement contained
         assert!(composite_stats.contained(&col_x, &values).is_none());
 
-        // Non-existent column should return None for everything
+        // Non-existent column should return None for column-specific stats,
+        // but row_counts is container-level and still available
         let non_existent = Column::new_unqualified("non_existent");
         assert!(composite_stats.min_values(&non_existent).is_none());
         assert!(composite_stats.max_values(&non_existent).is_none());
         assert!(composite_stats.null_counts(&non_existent).is_none());
-        assert!(composite_stats.row_counts(&non_existent).is_none());
+        assert!(composite_stats.row_counts().is_some());
         assert!(composite_stats.contained(&non_existent, &values).is_none());
 
         // Verify num_containers matches
@@ -1022,7 +1153,7 @@ mod tests {
         let expected_null_counts = vec![Some(0), Some(5)];
         assert_eq!(null_counts, expected_null_counts);
 
-        let row_counts = as_uint64_array(&composite_stats.row_counts(&col_a).unwrap())
+        let row_counts = as_uint64_array(&composite_stats.row_counts().unwrap())
             .unwrap()
             .into_iter()
             .collect::<Vec<_>>();
@@ -1062,11 +1193,10 @@ mod tests {
         let expected_null_counts = vec![Some(10), Some(20)];
         assert_eq!(null_counts, expected_null_counts);
 
-        let row_counts =
-            as_uint64_array(&composite_stats_reversed.row_counts(&col_a).unwrap())
-                .unwrap()
-                .into_iter()
-                .collect::<Vec<_>>();
+        let row_counts = as_uint64_array(&composite_stats_reversed.row_counts().unwrap())
+            .unwrap()
+            .into_iter()
.collect::>(); let expected_row_counts = vec![Some(1000), Some(2000)]; assert_eq!(row_counts, expected_row_counts); } diff --git a/datafusion/common/src/pyarrow.rs b/datafusion/common/src/pyarrow.rs deleted file mode 100644 index 18c6739735ff7..0000000000000 --- a/datafusion/common/src/pyarrow.rs +++ /dev/null @@ -1,169 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Conversions between PyArrow and DataFusion types - -use arrow::array::{Array, ArrayData}; -use arrow::pyarrow::{FromPyArrow, ToPyArrow}; -use pyo3::exceptions::PyException; -use pyo3::prelude::PyErr; -use pyo3::types::{PyAnyMethods, PyList}; -use pyo3::{Bound, FromPyObject, IntoPyObject, PyAny, PyResult, Python}; - -use crate::{DataFusionError, ScalarValue}; - -impl From for PyErr { - fn from(err: DataFusionError) -> PyErr { - PyException::new_err(err.to_string()) - } -} - -impl FromPyArrow for ScalarValue { - fn from_pyarrow_bound(value: &Bound<'_, PyAny>) -> PyResult { - let py = value.py(); - let typ = value.getattr("type")?; - let val = value.call_method0("as_py")?; - - // construct pyarrow array from the python value and pyarrow type - let factory = py.import("pyarrow")?.getattr("array")?; - let args = PyList::new(py, [val])?; - let array = factory.call1((args, typ))?; - - // convert the pyarrow array to rust array using C data interface - let array = arrow::array::make_array(ArrayData::from_pyarrow_bound(&array)?); - let scalar = ScalarValue::try_from_array(&array, 0)?; - - Ok(scalar) - } -} - -impl ToPyArrow for ScalarValue { - fn to_pyarrow<'py>(&self, py: Python<'py>) -> PyResult> { - let array = self.to_array()?; - // convert to pyarrow array using C data interface - let pyarray = array.to_data().to_pyarrow(py)?; - let pyscalar = pyarray.call_method1("__getitem__", (0,))?; - - Ok(pyscalar) - } -} - -impl<'source> FromPyObject<'source> for ScalarValue { - fn extract_bound(value: &Bound<'source, PyAny>) -> PyResult { - Self::from_pyarrow_bound(value) - } -} - -impl<'source> IntoPyObject<'source> for ScalarValue { - type Target = PyAny; - - type Output = Bound<'source, Self::Target>; - - type Error = PyErr; - - fn into_pyobject(self, py: Python<'source>) -> Result { - let array = self.to_array()?; - // convert to pyarrow array using C data interface - let pyarray = array.to_data().to_pyarrow(py)?; - pyarray.call_method1("__getitem__", (0,)) - } -} - -#[cfg(test)] -mod tests { - use pyo3::ffi::c_str; - use pyo3::py_run; - use pyo3::types::PyDict; - use pyo3::Python; - - use super::*; - - fn init_python() { - Python::initialize(); - Python::attach(|py| { - if py.run(c_str!("import pyarrow"), None, None).is_err() { - let locals = PyDict::new(py); - py.run( - c_str!( - "import sys; executable = sys.executable; python_path = sys.path" - ), - None, - 
Some(&locals), - ) - .expect("Couldn't get python info"); - let executable = locals.get_item("executable").unwrap(); - let executable: String = executable.extract().unwrap(); - - let python_path = locals.get_item("python_path").unwrap(); - let python_path: Vec = python_path.extract().unwrap(); - - panic!("pyarrow not found\nExecutable: {executable}\nPython path: {python_path:?}\n\ - HINT: try `pip install pyarrow`\n\ - NOTE: On Mac OS, you must compile against a Framework Python \ - (default in python.org installers and brew, but not pyenv)\n\ - NOTE: On Mac OS, PYO3 might point to incorrect Python library \ - path when using virtual environments. Try \ - `export PYTHONPATH=$(python -c \"import sys; print(sys.path[-1])\")`\n") - } - }) - } - - #[test] - fn test_roundtrip() { - init_python(); - - let example_scalars = [ - ScalarValue::Boolean(Some(true)), - ScalarValue::Int32(Some(23)), - ScalarValue::Float64(Some(12.34)), - ScalarValue::from("Hello!"), - ScalarValue::Date32(Some(1234)), - ]; - - Python::attach(|py| { - for scalar in example_scalars.iter() { - let result = - ScalarValue::from_pyarrow_bound(&scalar.to_pyarrow(py).unwrap()) - .unwrap(); - assert_eq!(scalar, &result); - } - }); - } - - #[test] - fn test_py_scalar() -> PyResult<()> { - init_python(); - - Python::attach(|py| -> PyResult<()> { - let scalar_float = ScalarValue::Float64(Some(12.34)); - let py_float = scalar_float - .into_pyobject(py)? - .call_method0("as_py") - .unwrap(); - py_run!(py, py_float, "assert py_float == 12.34"); - - let scalar_string = ScalarValue::Utf8(Some("Hello!".to_string())); - let py_string = scalar_string - .into_pyobject(py)? - .call_method0("as_py") - .unwrap(); - py_run!(py, py_string, "assert py_string == 'Hello!'"); - - Ok(()) - }) - } -} diff --git a/datafusion/common/src/rounding.rs b/datafusion/common/src/rounding.rs index 95eefd3235b5f..1796143d7cf1a 100644 --- a/datafusion/common/src/rounding.rs +++ b/datafusion/common/src/rounding.rs @@ -47,7 +47,7 @@ extern crate libc; any(target_arch = "x86_64", target_arch = "aarch64"), not(target_os = "windows") ))] -extern "C" { +unsafe extern "C" { fn fesetround(round: i32); fn fegetround() -> i32; } diff --git a/datafusion/common/src/scalar/cache.rs b/datafusion/common/src/scalar/cache.rs index f1476a518774b..5b1ad4e4ede01 100644 --- a/datafusion/common/src/scalar/cache.rs +++ b/datafusion/common/src/scalar/cache.rs @@ -20,10 +20,10 @@ use std::iter::repeat_n; use std::sync::{Arc, LazyLock, Mutex}; -use arrow::array::{new_null_array, Array, ArrayRef, PrimitiveArray}; +use arrow::array::{Array, ArrayRef, PrimitiveArray, new_null_array}; use arrow::datatypes::{ - ArrowDictionaryKeyType, DataType, Int16Type, Int32Type, Int64Type, Int8Type, - UInt16Type, UInt32Type, UInt64Type, UInt8Type, + ArrowDictionaryKeyType, DataType, Int8Type, Int16Type, Int32Type, Int64Type, + UInt8Type, UInt16Type, UInt32Type, UInt64Type, }; /// Maximum number of rows to cache to be conservative on memory usage diff --git a/datafusion/common/src/scalar/consts.rs b/datafusion/common/src/scalar/consts.rs index 8cb446b1c9211..599c2523cd2c7 100644 --- a/datafusion/common/src/scalar/consts.rs +++ b/datafusion/common/src/scalar/consts.rs @@ -17,24 +17,36 @@ // Constants defined for scalar construction. 
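The `half::f16` bounds added just below are spelled as raw bit patterns rather than computed expressions. A quick sanity check that `0x4249` really is the next f16 value above π (a standalone sketch using only the `half` crate, which this code already depends on; the bit arithmetic is for illustration only):

```rust
use half::f16;

fn main() {
    // π rounds to 3.140625 in f16, whose bit pattern is 0x4248.
    let pi = f16::from_f32(std::f32::consts::PI);
    assert_eq!(pi.to_bits(), 0x4248);

    // Incrementing the bits of a finite positive f16 yields the next
    // representable value above it: 0x4249 == 3.142578125 > π.
    let pi_upper = f16::from_bits(pi.to_bits() + 1);
    assert_eq!(pi_upper.to_bits(), 0x4249);
    assert!(f32::from(pi_upper) > std::f32::consts::PI);

    // The negative bounds only differ by the sign bit: -π is 0xC248, and
    // the next value *below* it is 0xC249.
    assert_eq!((-pi).to_bits(), 0xC248);
}
```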
+// Next f16 value above π (upper bound)
+pub(super) const PI_UPPER_F16: half::f16 = half::f16::from_bits(0x4249);
+
 // Next f32 value above π (upper bound)
 pub(super) const PI_UPPER_F32: f32 = std::f32::consts::PI.next_up();
 
 // Next f64 value above π (upper bound)
 pub(super) const PI_UPPER_F64: f64 = std::f64::consts::PI.next_up();
 
+// Next f16 value below -π (lower bound)
+pub(super) const NEGATIVE_PI_LOWER_F16: half::f16 = half::f16::from_bits(0xC249);
+
 // Next f32 value below -π (lower bound)
 pub(super) const NEGATIVE_PI_LOWER_F32: f32 = (-std::f32::consts::PI).next_down();
 
 // Next f64 value below -π (lower bound)
 pub(super) const NEGATIVE_PI_LOWER_F64: f64 = (-std::f64::consts::PI).next_down();
 
+// Next f16 value above π/2 (upper bound)
+pub(super) const FRAC_PI_2_UPPER_F16: half::f16 = half::f16::from_bits(0x3E49);
+
 // Next f32 value above π/2 (upper bound)
 pub(super) const FRAC_PI_2_UPPER_F32: f32 = std::f32::consts::FRAC_PI_2.next_up();
 
 // Next f64 value above π/2 (upper bound)
 pub(super) const FRAC_PI_2_UPPER_F64: f64 = std::f64::consts::FRAC_PI_2.next_up();
 
+// Next f16 value below -π/2 (lower bound)
+pub(super) const NEGATIVE_FRAC_PI_2_LOWER_F16: half::f16 = half::f16::from_bits(0xBE49);
+
 // Next f32 value below -π/2 (lower bound)
 pub(super) const NEGATIVE_FRAC_PI_2_LOWER_F32: f32 =
     (-std::f32::consts::FRAC_PI_2).next_down();
diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs
index 188a169a3dd2f..d726b5c94016f 100644
--- a/datafusion/common/src/scalar/mod.rs
+++ b/datafusion/common/src/scalar/mod.rs
@@ -26,6 +26,7 @@ use std::cmp::Ordering;
 use std::collections::{HashSet, VecDeque};
 use std::convert::Infallible;
 use std::fmt;
+use std::fmt::Write;
 use std::hash::Hash;
 use std::hash::Hasher;
 use std::iter::repeat_n;
@@ -33,64 +34,163 @@ use std::mem::{size_of, size_of_val};
 use std::str::FromStr;
 use std::sync::Arc;
 
+use crate::assert_or_internal_err;
 use crate::cast::{
     as_binary_array, as_binary_view_array, as_boolean_array, as_date32_array,
-    as_date64_array, as_decimal128_array, as_decimal256_array, as_decimal32_array,
-    as_decimal64_array, as_dictionary_array, as_duration_microsecond_array,
+    as_date64_array, as_decimal32_array, as_decimal64_array, as_decimal128_array,
+    as_decimal256_array, as_dictionary_array, as_duration_microsecond_array,
     as_duration_millisecond_array, as_duration_nanosecond_array,
     as_duration_second_array, as_fixed_size_binary_array, as_fixed_size_list_array,
-    as_float16_array, as_float32_array, as_float64_array, as_int16_array, as_int32_array,
-    as_int64_array, as_int8_array, as_interval_dt_array, as_interval_mdn_array,
+    as_float16_array, as_float32_array, as_float64_array, as_int8_array, as_int16_array,
+    as_int32_array, as_int64_array, as_interval_dt_array, as_interval_mdn_array,
     as_interval_ym_array, as_large_binary_array, as_large_list_array,
-    as_large_string_array, as_string_array, as_string_view_array,
-    as_time32_millisecond_array, as_time32_second_array, as_time64_microsecond_array,
-    as_time64_nanosecond_array, as_timestamp_microsecond_array,
-    as_timestamp_millisecond_array, as_timestamp_nanosecond_array,
-    as_timestamp_second_array, as_uint16_array, as_uint32_array, as_uint64_array,
-    as_uint8_array, as_union_array,
+    as_large_list_view_array, as_large_string_array, as_list_view_array, as_run_array,
+    as_string_array, as_string_view_array, as_time32_millisecond_array,
+    as_time32_second_array, as_time64_microsecond_array, as_time64_nanosecond_array,
+    as_timestamp_microsecond_array,
as_timestamp_millisecond_array, + as_timestamp_nanosecond_array, as_timestamp_second_array, as_uint8_array, + as_uint16_array, as_uint32_array, as_uint64_array, as_union_array, }; -use crate::error::{DataFusionError, Result, _exec_err, _internal_err, _not_impl_err}; +use crate::error::{_exec_err, _internal_err, _not_impl_err, DataFusionError, Result}; use crate::format::DEFAULT_CAST_OPTIONS; use crate::hash_utils::create_hashes; use crate::utils::SingleRowListArrayBuilder; use crate::{_internal_datafusion_err, arrow_datafusion_err}; use arrow::array::{ - new_empty_array, new_null_array, Array, ArrayData, ArrayRef, ArrowNativeTypeOp, - ArrowPrimitiveType, AsArray, BinaryArray, BinaryViewArray, BooleanArray, Date32Array, - Date64Array, Decimal128Array, Decimal256Array, Decimal32Array, Decimal64Array, + Array, ArrayData, ArrayDataBuilder, ArrayRef, ArrowNativeTypeOp, ArrowPrimitiveType, + AsArray, BinaryArray, BinaryViewArray, BinaryViewBuilder, BooleanArray, Date32Array, + Date64Array, Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array, DictionaryArray, DurationMicrosecondArray, DurationMillisecondArray, DurationNanosecondArray, DurationSecondArray, FixedSizeBinaryArray, FixedSizeListArray, Float16Array, Float32Array, Float64Array, GenericListArray, - Int16Array, Int32Array, Int64Array, Int8Array, IntervalDayTimeArray, - IntervalMonthDayNanoArray, IntervalYearMonthArray, LargeBinaryArray, LargeListArray, - LargeStringArray, ListArray, MapArray, MutableArrayData, OffsetSizeTrait, - PrimitiveArray, Scalar, StringArray, StringViewArray, StructArray, - Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, - Time64NanosecondArray, TimestampMicrosecondArray, TimestampMillisecondArray, - TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, - UInt64Array, UInt8Array, UnionArray, + GenericListViewArray, Int8Array, Int16Array, Int32Array, Int64Array, + IntervalDayTimeArray, IntervalMonthDayNanoArray, IntervalYearMonthArray, + LargeBinaryArray, LargeListArray, LargeListViewArray, LargeStringArray, ListArray, + ListViewArray, MapArray, MutableArrayData, PrimitiveArray, RunArray, Scalar, + StringArray, StringViewArray, StringViewBuilder, StructArray, Time32MillisecondArray, + Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + TimestampSecondArray, UInt8Array, UInt16Array, UInt32Array, UInt64Array, UnionArray, + downcast_run_array, new_empty_array, new_null_array, }; use arrow::buffer::{BooleanBuffer, ScalarBuffer}; -use arrow::compute::kernels::cast::{cast_with_options, CastOptions}; +use arrow::compute::kernels::cast::{CastOptions, cast_with_options}; use arrow::compute::kernels::numeric::{ add, add_wrapping, div, mul, mul_wrapping, rem, sub, sub_wrapping, }; use arrow::datatypes::{ - i256, validate_decimal_precision_and_scale, ArrowDictionaryKeyType, ArrowNativeType, - ArrowTimestampType, DataType, Date32Type, Decimal128Type, Decimal256Type, - Decimal32Type, Decimal64Type, Field, Float32Type, Int16Type, Int32Type, Int64Type, - Int8Type, IntervalDayTime, IntervalDayTimeType, IntervalMonthDayNano, - IntervalMonthDayNanoType, IntervalUnit, IntervalYearMonthType, TimeUnit, - TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, - TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, UnionFields, - UnionMode, DECIMAL128_MAX_PRECISION, + ArrowDictionaryKeyType, ArrowNativeType, ArrowTimestampType, DataType, Date32Type, + 
+    Decimal32Type, Decimal64Type, Decimal128Type, Decimal256Type, DecimalType, Field,
+    FieldRef, Float32Type, Int8Type, Int16Type, Int32Type, Int64Type, IntervalDayTime,
+    IntervalDayTimeType, IntervalMonthDayNano, IntervalMonthDayNanoType, IntervalUnit,
+    IntervalYearMonthType, RunEndIndexType, TimeUnit, TimestampMicrosecondType,
+    TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt8Type,
+    UInt16Type, UInt32Type, UInt64Type, UnionFields, UnionMode, i256,
+    validate_decimal_precision_and_scale,
 };
-use arrow::util::display::{array_value_to_string, ArrayFormatter, FormatOptions};
+use arrow::util::display::{ArrayFormatter, FormatOptions, array_value_to_string};
 use cache::{get_or_create_cached_key_array, get_or_create_cached_null_array};
 use chrono::{Duration, NaiveDate};
 use half::f16;
 pub use struct_builder::ScalarStructBuilder;
 
+const SECONDS_PER_DAY: i64 = 86_400;
+const MILLIS_PER_DAY: i64 = SECONDS_PER_DAY * 1_000;
+const MICROS_PER_DAY: i64 = MILLIS_PER_DAY * 1_000;
+const NANOS_PER_DAY: i64 = MICROS_PER_DAY * 1_000;
+const MICROS_PER_MILLISECOND: i64 = 1_000;
+const NANOS_PER_MILLISECOND: i64 = 1_000_000;
+
+/// Returns the multiplier that converts the input date representation into the
+/// desired timestamp unit, if the conversion requires a multiplication that can
+/// overflow an `i64`.
+pub fn date_to_timestamp_multiplier(
+    source_type: &DataType,
+    target_type: &DataType,
+) -> Option<i64> {
+    let DataType::Timestamp(target_unit, _) = target_type else {
+        return None;
+    };
+
+    // Only `Timestamp` target types have a time unit; otherwise no
+    // multiplier applies (handled above). The function returns `Some(m)`
+    // when converting the `source_type` to `target_type` requires a
+    // multiplication that could overflow `i64`. It returns `None` when
+    // the conversion is a division or otherwise doesn't require a
+    // multiplication (e.g. Date64 -> Second).
+    match source_type {
+        // Date32 stores days since epoch. Converting to any timestamp
+        // unit requires multiplying by the per-day factor (seconds,
+        // milliseconds, microseconds, nanoseconds).
+        DataType::Date32 => Some(match target_unit {
+            TimeUnit::Second => SECONDS_PER_DAY,
+            TimeUnit::Millisecond => MILLIS_PER_DAY,
+            TimeUnit::Microsecond => MICROS_PER_DAY,
+            TimeUnit::Nanosecond => NANOS_PER_DAY,
+        }),
+
+        // Date64 stores milliseconds since epoch. Converting to
+        // seconds is a division (no multiplication), so return `None`.
+        // Converting to milliseconds is an identity conversion (also `None`).
+        // Converting to micro/nano requires multiplying by 1_000 / 1_000_000.
+        DataType::Date64 => match target_unit {
+            TimeUnit::Second => None,
+            // Converting Date64 (ms since epoch) to millisecond timestamps
+            // is an identity conversion and does not require multiplication.
+            // Returning `None` indicates no multiplication-based overflow
+            // check is necessary.
+            TimeUnit::Millisecond => None,
+            TimeUnit::Microsecond => Some(MICROS_PER_MILLISECOND),
+            TimeUnit::Nanosecond => Some(NANOS_PER_MILLISECOND),
+        },
+
+        _ => None,
+    }
+}
+
+/// Ensures the provided value can be represented as a timestamp with the given
+/// multiplier. Returns a [`DataFusionError::Execution`] when the converted
+/// value would overflow the timestamp range.
+pub fn ensure_timestamp_in_bounds( + value: i64, + multiplier: i64, + source_type: &DataType, + target_type: &DataType, +) -> Result<()> { + if multiplier <= 1 { + return Ok(()); + } + + if value.checked_mul(multiplier).is_none() { + let target = format_timestamp_type_for_error(target_type); + _exec_err!( + "Cannot cast {} value {} to {}: converted value exceeds the representable i64 range", + source_type, + value, + target + ) + } else { + Ok(()) + } +} + +/// Format a `DataType::Timestamp` into a short, stable string used in +/// user-facing error messages. +pub(crate) fn format_timestamp_type_for_error(target_type: &DataType) -> String { + match target_type { + DataType::Timestamp(unit, _) => { + let s = match unit { + TimeUnit::Second => "s", + TimeUnit::Millisecond => "ms", + TimeUnit::Microsecond => "us", + TimeUnit::Nanosecond => "ns", + }; + format!("Timestamp({s})") + } + other => format!("{other}"), + } +} + /// A dynamically typed, nullable single value. /// /// While an arrow [`Array`]) stores one or more values of the same type, in a @@ -158,8 +258,8 @@ pub use struct_builder::ScalarStructBuilder; /// /// # Nested Types /// -/// `List` / `LargeList` / `FixedSizeList` / `Struct` / `Map` are represented as a -/// single element array of the corresponding type. +/// `List` / `LargeList` / `FixedSizeList` / `ListView` / `LargeListView` / `Struct` / `Map` +/// are represented as a single element array of the corresponding type. /// /// ## Example: Creating [`ScalarValue::Struct`] using [`ScalarStructBuilder`] /// ``` @@ -282,6 +382,14 @@ pub enum ScalarValue { List(Arc), /// The array must be a LargeListArray with length 1. LargeList(Arc), + /// Represents a single element of a [`ListViewArray`] as an [`ArrayRef`] + /// + /// The array must be a ListViewArray with length 1. + ListView(Arc), + /// Represents a single element of a [`LargeListViewArray`] as an [`ArrayRef`] + /// + /// The array must be a LargeListViewArray with length 1. + LargeListView(Arc), /// Represents a single element [`StructArray`] as an [`ArrayRef`]. See /// [`ScalarValue`] for examples of how to create instances of this type. 
    Struct(Arc<StructArray>),
@@ -331,6 +439,8 @@ pub enum ScalarValue {
     Union(Option<(i8, Box<ScalarValue>)>, UnionFields, UnionMode),
     /// Dictionary type: index type and value
     Dictionary(Box<DataType>, Box<ScalarValue>),
+    /// (run-ends field, value field, value)
+    RunEndEncoded(FieldRef, FieldRef, Box<ScalarValue>),
 }
 
 impl Hash for Fl {
@@ -416,6 +526,10 @@ impl PartialEq for ScalarValue {
             (List(_), _) => false,
             (LargeList(v1), LargeList(v2)) => v1.eq(v2),
             (LargeList(_), _) => false,
+            (ListView(v1), ListView(v2)) => v1.eq(v2),
+            (ListView(_), _) => false,
+            (LargeListView(v1), LargeListView(v2)) => v1.eq(v2),
+            (LargeListView(_), _) => false,
             (Struct(v1), Struct(v2)) => v1.eq(v2),
             (Struct(_), _) => false,
             (Map(v1), Map(v2)) => v1.eq(v2),
@@ -460,6 +574,10 @@ impl PartialEq for ScalarValue {
             (Union(_, _, _), _) => false,
             (Dictionary(k1, v1), Dictionary(k2, v2)) => k1.eq(k2) && v1.eq(v2),
             (Dictionary(_, _), _) => false,
+            (RunEndEncoded(rf1, vf1, v1), RunEndEncoded(rf2, vf2, v2)) => {
+                rf1.eq(rf2) && vf1.eq(vf2) && v1.eq(v2)
+            }
+            (RunEndEncoded(_, _, _), _) => false,
             (Null, Null) => true,
             (Null, _) => false,
         }
@@ -557,7 +675,8 @@ impl PartialOrd for ScalarValue {
             (FixedSizeBinary(_, _), _) => None,
             (LargeBinary(v1), LargeBinary(v2)) => v1.partial_cmp(v2),
             (LargeBinary(_), _) => None,
-            // ScalarValue::List / ScalarValue::FixedSizeList / ScalarValue::LargeList are ensure to have length 1
+            // ScalarValue::List / ScalarValue::FixedSizeList / ScalarValue::LargeList / ScalarValue::ListView / ScalarValue::LargeListView
+            // are guaranteed to have length 1
             (List(arr1), List(arr2)) => partial_cmp_list(arr1.as_ref(), arr2.as_ref()),
             (FixedSizeList(arr1), FixedSizeList(arr2)) => {
                 partial_cmp_list(arr1.as_ref(), arr2.as_ref())
@@ -565,7 +684,17 @@ impl PartialOrd for ScalarValue {
             (LargeList(arr1), LargeList(arr2)) => {
                 partial_cmp_list(arr1.as_ref(), arr2.as_ref())
             }
-            (List(_), _) | (LargeList(_), _) | (FixedSizeList(_), _) => None,
+            (ListView(arr1), ListView(arr2)) => {
+                partial_cmp_list(arr1.as_ref(), arr2.as_ref())
+            }
+            (LargeListView(arr1), LargeListView(arr2)) => {
+                partial_cmp_list(arr1.as_ref(), arr2.as_ref())
+            }
+            (List(_), _)
+            | (LargeList(_), _)
+            | (FixedSizeList(_), _)
+            | (ListView(_), _)
+            | (LargeListView(_), _) => None,
             (Struct(struct_arr1), Struct(struct_arr2)) => {
                 partial_cmp_struct(struct_arr1.as_ref(), struct_arr2.as_ref())
             }
@@ -622,20 +751,25 @@ impl PartialOrd for ScalarValue {
             (Union(_, _, _), _) => None,
             (Dictionary(k1, v1), Dictionary(k2, v2)) => {
                 // Don't compare if the key types don't match (it is effectively a different datatype)
-                if k1 == k2 {
-                    v1.partial_cmp(v2)
-                } else {
-                    None
-                }
-            }
-            (Dictionary(_, _), _) => None,
+                if k1 == k2 { v1.partial_cmp(v2) } else { None }
+            }
+            (Dictionary(_, _), _) => None,
+            (RunEndEncoded(rf1, vf1, v1), RunEndEncoded(rf2, vf2, v2)) => {
+                // Don't compare if the run-ends or value fields don't match (it is effectively a different datatype)
+                if rf1 == rf2 && vf1 == vf2 { v1.partial_cmp(v2) } else { None }
+            }
+            (RunEndEncoded(_, _, _), _) => None,
             (Null, Null) => Some(Ordering::Equal),
             (Null, _) => None,
         }
     }
 }
 
-/// List/LargeList/FixedSizeList scalars always have a single element
+/// List/LargeList/FixedSizeList/ListView/LargeListView scalars always have a single element
 /// array. This function returns that array
 fn first_array_for_list(arr: &dyn Array) -> ArrayRef {
     assert_eq!(arr.len(), 1);
@@ -645,12 +779,18 @@ fn first_array_for_list(arr: &dyn Array) -> ArrayRef {
         arr.value(0)
     } else if let Some(arr) = arr.as_fixed_size_list_opt() {
         arr.value(0)
+    } else if let Some(arr) = arr.as_list_view_opt::<i32>() {
+        arr.value(0)
+    } else if let Some(arr) = arr.as_list_view_opt::<i64>() {
+        arr.value(0)
     } else {
-        unreachable!("Since only List / LargeList / FixedSizeList are supported, this should never happen")
+        unreachable!(
+            "Since only List / LargeList / FixedSizeList / ListView / LargeListView are supported, this should never happen"
+        )
     }
 }
 
-/// Compares two List/LargeList/FixedSizeList scalars
+/// Compares two List/LargeList/FixedSizeList/ListView/LargeListView scalars
 fn partial_cmp_list(arr1: &dyn Array, arr2: &dyn Array) -> Option<Ordering> {
     if arr1.data_type() != arr2.data_type() {
         return None;
@@ -838,6 +978,12 @@ impl Hash for ScalarValue {
             FixedSizeList(arr) => {
                 hash_nested_array(arr.to_owned() as ArrayRef, state);
             }
+            ListView(arr) => {
+                hash_nested_array(arr.to_owned() as ArrayRef, state);
+            }
+            LargeListView(arr) => {
+                hash_nested_array(arr.to_owned() as ArrayRef, state);
+            }
             Struct(arr) => {
                 hash_nested_array(arr.to_owned() as ArrayRef, state);
             }
@@ -870,6 +1016,11 @@ impl Hash for ScalarValue {
                 k.hash(state);
                 v.hash(state);
             }
+            RunEndEncoded(rf, vf, v) => {
+                rf.hash(state);
+                vf.hash(state);
+                v.hash(state);
+            }
             // stable hash for Null value
             Null => 1.hash(state),
         }
@@ -878,10 +1029,10 @@ impl Hash for ScalarValue {
 
 fn hash_nested_array<H: Hasher>(arr: ArrayRef, state: &mut H) {
     let len = arr.len();
-    let arrays = vec![arr];
     let hashes_buffer = &mut vec![0; len];
-    let random_state = ahash::RandomState::with_seeds(0, 0, 0, 0);
-    let hashes = create_hashes(&arrays, &random_state, hashes_buffer).unwrap();
+    let random_state = crate::hash_utils::RandomState::with_seed(0);
+    let hashes = create_hashes(&[arr], &random_state, hashes_buffer)
+        .expect("hash_nested_array: failed to create row hashes");
     // Hash back to std::hash::Hasher
     hashes.hash(state);
 }
@@ -1055,13 +1206,8 @@ impl ScalarValue {
 
     /// Create a decimal Scalar from value/precision and scale.
     pub fn try_new_decimal128(value: i128, precision: u8, scale: i8) -> Result<Self> {
-        // make sure the precision and scale is valid
-        if precision <= DECIMAL128_MAX_PRECISION && scale.unsigned_abs() <= precision {
-            return Ok(ScalarValue::Decimal128(Some(value), precision, scale));
-        }
-        _internal_err!(
-            "Can not new a decimal type ScalarValue for precision {precision} and scale {scale}"
-        )
+        Self::validate_decimal_or_internal_err::<Decimal128Type>(precision, scale)?;
+        Ok(ScalarValue::Decimal128(Some(value), precision, scale))
     }
 
     /// Create a Null instance of ScalarValue for this datatype
@@ -1153,7 +1299,14 @@ impl ScalarValue {
                 index_type.clone(),
                 Box::new(value_type.as_ref().try_into()?),
             ),
+            DataType::RunEndEncoded(run_ends_field, value_field) => {
+                ScalarValue::RunEndEncoded(
+                    Arc::clone(run_ends_field),
+                    Arc::clone(value_field),
+                    Box::new(value_field.data_type().try_into()?),
+                )
+            }
-            // `ScalaValue::List` contains single element `ListArray`.
+            // `ScalarValue::List` contains single element `ListArray`.
DataType::List(field_ref) => ScalarValue::List(Arc::new( GenericListArray::new_null(Arc::clone(field_ref), 1), )), @@ -1161,7 +1314,7 @@ impl ScalarValue { DataType::LargeList(field_ref) => ScalarValue::LargeList(Arc::new( GenericListArray::new_null(Arc::clone(field_ref), 1), )), - // `ScalaValue::FixedSizeList` contains single element `FixedSizeList`. + // `ScalarValue::FixedSizeList` contains single element `FixedSizeList`. DataType::FixedSizeList(field_ref, fixed_length) => { ScalarValue::FixedSizeList(Arc::new(FixedSizeListArray::new_null( Arc::clone(field_ref), @@ -1169,6 +1322,12 @@ impl ScalarValue { 1, ))) } + DataType::ListView(field_ref) => ScalarValue::ListView(Arc::new( + GenericListViewArray::new_null(Arc::clone(field_ref), 1), + )), + DataType::LargeListView(field_ref) => ScalarValue::LargeListView(Arc::new( + GenericListViewArray::new_null(Arc::clone(field_ref), 1), + )), DataType::Struct(fields) => ScalarValue::Struct( new_null_array(&DataType::Struct(fields.to_owned()), 1) .as_struct() @@ -1241,6 +1400,7 @@ impl ScalarValue { /// Returns a [`ScalarValue`] representing PI pub fn new_pi(datatype: &DataType) -> Result { match datatype { + DataType::Float16 => Ok(ScalarValue::from(f16::PI)), DataType::Float32 => Ok(ScalarValue::from(std::f32::consts::PI)), DataType::Float64 => Ok(ScalarValue::from(std::f64::consts::PI)), _ => _internal_err!("PI is not supported for data type: {}", datatype), @@ -1250,6 +1410,7 @@ impl ScalarValue { /// Returns a [`ScalarValue`] representing PI's upper bound pub fn new_pi_upper(datatype: &DataType) -> Result { match datatype { + DataType::Float16 => Ok(ScalarValue::Float16(Some(consts::PI_UPPER_F16))), DataType::Float32 => Ok(ScalarValue::from(consts::PI_UPPER_F32)), DataType::Float64 => Ok(ScalarValue::from(consts::PI_UPPER_F64)), _ => { @@ -1261,6 +1422,9 @@ impl ScalarValue { /// Returns a [`ScalarValue`] representing -PI's lower bound pub fn new_negative_pi_lower(datatype: &DataType) -> Result { match datatype { + DataType::Float16 => { + Ok(ScalarValue::Float16(Some(consts::NEGATIVE_PI_LOWER_F16))) + } DataType::Float32 => Ok(ScalarValue::from(consts::NEGATIVE_PI_LOWER_F32)), DataType::Float64 => Ok(ScalarValue::from(consts::NEGATIVE_PI_LOWER_F64)), _ => { @@ -1272,6 +1436,9 @@ impl ScalarValue { /// Returns a [`ScalarValue`] representing FRAC_PI_2's upper bound pub fn new_frac_pi_2_upper(datatype: &DataType) -> Result { match datatype { + DataType::Float16 => { + Ok(ScalarValue::Float16(Some(consts::FRAC_PI_2_UPPER_F16))) + } DataType::Float32 => Ok(ScalarValue::from(consts::FRAC_PI_2_UPPER_F32)), DataType::Float64 => Ok(ScalarValue::from(consts::FRAC_PI_2_UPPER_F64)), _ => { @@ -1283,6 +1450,9 @@ impl ScalarValue { // Returns a [`ScalarValue`] representing FRAC_PI_2's lower bound pub fn new_neg_frac_pi_2_lower(datatype: &DataType) -> Result { match datatype { + DataType::Float16 => Ok(ScalarValue::Float16(Some( + consts::NEGATIVE_FRAC_PI_2_LOWER_F16, + ))), DataType::Float32 => { Ok(ScalarValue::from(consts::NEGATIVE_FRAC_PI_2_LOWER_F32)) } @@ -1298,6 +1468,7 @@ impl ScalarValue { /// Returns a [`ScalarValue`] representing -PI pub fn new_negative_pi(datatype: &DataType) -> Result { match datatype { + DataType::Float16 => Ok(ScalarValue::from(-f16::PI)), DataType::Float32 => Ok(ScalarValue::from(-std::f32::consts::PI)), DataType::Float64 => Ok(ScalarValue::from(-std::f64::consts::PI)), _ => _internal_err!("-PI is not supported for data type: {}", datatype), @@ -1307,6 +1478,7 @@ impl ScalarValue { /// Returns a [`ScalarValue`] 
representing PI/2 pub fn new_frac_pi_2(datatype: &DataType) -> Result { match datatype { + DataType::Float16 => Ok(ScalarValue::from(f16::FRAC_PI_2)), DataType::Float32 => Ok(ScalarValue::from(std::f32::consts::FRAC_PI_2)), DataType::Float64 => Ok(ScalarValue::from(std::f64::consts::FRAC_PI_2)), _ => _internal_err!("PI/2 is not supported for data type: {}", datatype), @@ -1316,6 +1488,7 @@ impl ScalarValue { /// Returns a [`ScalarValue`] representing -PI/2 pub fn new_neg_frac_pi_2(datatype: &DataType) -> Result { match datatype { + DataType::Float16 => Ok(ScalarValue::from(-f16::FRAC_PI_2)), DataType::Float32 => Ok(ScalarValue::from(-std::f32::consts::FRAC_PI_2)), DataType::Float64 => Ok(ScalarValue::from(-std::f64::consts::FRAC_PI_2)), _ => _internal_err!("-PI/2 is not supported for data type: {}", datatype), @@ -1325,6 +1498,7 @@ impl ScalarValue { /// Returns a [`ScalarValue`] representing infinity pub fn new_infinity(datatype: &DataType) -> Result { match datatype { + DataType::Float16 => Ok(ScalarValue::from(f16::INFINITY)), DataType::Float32 => Ok(ScalarValue::from(f32::INFINITY)), DataType::Float64 => Ok(ScalarValue::from(f64::INFINITY)), _ => { @@ -1336,6 +1510,7 @@ impl ScalarValue { /// Returns a [`ScalarValue`] representing negative infinity pub fn new_neg_infinity(datatype: &DataType) -> Result { match datatype { + DataType::Float16 => Ok(ScalarValue::from(f16::NEG_INFINITY)), DataType::Float32 => Ok(ScalarValue::from(f32::NEG_INFINITY)), DataType::Float64 => Ok(ScalarValue::from(f64::NEG_INFINITY)), _ => { @@ -1359,7 +1534,7 @@ impl ScalarValue { DataType::UInt16 => ScalarValue::UInt16(Some(0)), DataType::UInt32 => ScalarValue::UInt32(Some(0)), DataType::UInt64 => ScalarValue::UInt64(Some(0)), - DataType::Float16 => ScalarValue::Float16(Some(f16::from_f32(0.0))), + DataType::Float16 => ScalarValue::Float16(Some(f16::ZERO)), DataType::Float32 => ScalarValue::Float32(Some(0.0)), DataType::Float64 => ScalarValue::Float64(Some(0.0)), DataType::Decimal32(precision, scale) => { @@ -1467,6 +1642,8 @@ impl ScalarValue { | DataType::Float16 | DataType::Float32 | DataType::Float64 + | DataType::Decimal32(_, _) + | DataType::Decimal64(_, _) | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) | DataType::Timestamp(_, _) @@ -1503,7 +1680,7 @@ impl ScalarValue { let empty_arr = new_empty_array(field.data_type()); let values = Arc::new( SingleRowListArrayBuilder::new(empty_arr) - .with_nullable(field.is_nullable()) + .with_field(field) .build_fixed_size_list_array(0), ); Ok(ScalarValue::FixedSizeList(values)) @@ -1512,6 +1689,24 @@ impl ScalarValue { let list = ScalarValue::new_large_list(&[], field.data_type()); Ok(ScalarValue::LargeList(list)) } + DataType::ListView(field) => { + let empty_arr = new_empty_array(field.data_type()); + let values = Arc::new( + SingleRowListArrayBuilder::new(empty_arr) + .with_field(field) + .build_list_view_array(), + ); + Ok(ScalarValue::ListView(values)) + } + DataType::LargeListView(field) => { + let empty_arr = new_empty_array(field.data_type()); + let values = Arc::new( + SingleRowListArrayBuilder::new(empty_arr) + .with_field(field) + .build_large_list_view_array(), + ); + Ok(ScalarValue::LargeListView(values)) + } // Struct types DataType::Struct(fields) => { @@ -1535,6 +1730,14 @@ impl ScalarValue { Box::new(ScalarValue::new_default(value_type)?), )), + DataType::RunEndEncoded(run_ends_field, value_field) => { + Ok(ScalarValue::RunEndEncoded( + Arc::clone(run_ends_field), + Arc::clone(value_field), + 
Box::new(ScalarValue::new_default(value_field.data_type())?), + )) + } + // Map types DataType::Map(field, _) => Ok(ScalarValue::Map(Arc::new(MapArray::from( ArrayData::new_empty(field.data_type()), @@ -1553,13 +1756,6 @@ impl ScalarValue { _internal_err!("Union type must have at least one field") } } - - // Unsupported types for now - _ => { - _not_impl_err!( - "Default value for data_type \"{datatype}\" is not implemented yet" - ) - } } } @@ -1574,16 +1770,14 @@ impl ScalarValue { DataType::UInt16 => ScalarValue::UInt16(Some(1)), DataType::UInt32 => ScalarValue::UInt32(Some(1)), DataType::UInt64 => ScalarValue::UInt64(Some(1)), - DataType::Float16 => ScalarValue::Float16(Some(f16::from_f32(1.0))), + DataType::Float16 => ScalarValue::Float16(Some(f16::ONE)), DataType::Float32 => ScalarValue::Float32(Some(1.0)), DataType::Float64 => ScalarValue::Float64(Some(1.0)), DataType::Decimal32(precision, scale) => { - validate_decimal_precision_and_scale::( + Self::validate_decimal_or_internal_err::( *precision, *scale, )?; - if *scale < 0 { - return _internal_err!("Negative scale is not supported"); - } + assert_or_internal_err!(*scale >= 0, "Negative scale is not supported"); match 10_i32.checked_pow(*scale as u32) { Some(value) => { ScalarValue::Decimal32(Some(value), *precision, *scale) @@ -1592,12 +1786,10 @@ impl ScalarValue { } } DataType::Decimal64(precision, scale) => { - validate_decimal_precision_and_scale::( + Self::validate_decimal_or_internal_err::( *precision, *scale, )?; - if *scale < 0 { - return _internal_err!("Negative scale is not supported"); - } + assert_or_internal_err!(*scale >= 0, "Negative scale is not supported"); match i64::from(10).checked_pow(*scale as u32) { Some(value) => { ScalarValue::Decimal64(Some(value), *precision, *scale) @@ -1606,12 +1798,10 @@ impl ScalarValue { } } DataType::Decimal128(precision, scale) => { - validate_decimal_precision_and_scale::( + Self::validate_decimal_or_internal_err::( *precision, *scale, )?; - if *scale < 0 { - return _internal_err!("Negative scale is not supported"); - } + assert_or_internal_err!(*scale >= 0, "Negative scale is not supported"); match i128::from(10).checked_pow(*scale as u32) { Some(value) => { ScalarValue::Decimal128(Some(value), *precision, *scale) @@ -1620,12 +1810,10 @@ impl ScalarValue { } } DataType::Decimal256(precision, scale) => { - validate_decimal_precision_and_scale::( + Self::validate_decimal_or_internal_err::( *precision, *scale, )?; - if *scale < 0 { - return _internal_err!("Negative scale is not supported"); - } + assert_or_internal_err!(*scale >= 0, "Negative scale is not supported"); match i256::from(10).checked_pow(*scale as u32) { Some(value) => { ScalarValue::Decimal256(Some(value), *precision, *scale) @@ -1648,16 +1836,14 @@ impl ScalarValue { DataType::Int16 | DataType::UInt16 => ScalarValue::Int16(Some(-1)), DataType::Int32 | DataType::UInt32 => ScalarValue::Int32(Some(-1)), DataType::Int64 | DataType::UInt64 => ScalarValue::Int64(Some(-1)), - DataType::Float16 => ScalarValue::Float16(Some(f16::from_f32(-1.0))), + DataType::Float16 => ScalarValue::Float16(Some(f16::NEG_ONE)), DataType::Float32 => ScalarValue::Float32(Some(-1.0)), DataType::Float64 => ScalarValue::Float64(Some(-1.0)), DataType::Decimal32(precision, scale) => { - validate_decimal_precision_and_scale::( + Self::validate_decimal_or_internal_err::( *precision, *scale, )?; - if *scale < 0 { - return _internal_err!("Negative scale is not supported"); - } + assert_or_internal_err!(*scale >= 0, "Negative scale is not supported"); 
match 10_i32.checked_pow(*scale as u32) { Some(value) => { ScalarValue::Decimal32(Some(-value), *precision, *scale) @@ -1666,12 +1852,10 @@ impl ScalarValue { } } DataType::Decimal64(precision, scale) => { - validate_decimal_precision_and_scale::( + Self::validate_decimal_or_internal_err::( *precision, *scale, )?; - if *scale < 0 { - return _internal_err!("Negative scale is not supported"); - } + assert_or_internal_err!(*scale >= 0, "Negative scale is not supported"); match i64::from(10).checked_pow(*scale as u32) { Some(value) => { ScalarValue::Decimal64(Some(-value), *precision, *scale) @@ -1680,12 +1864,10 @@ impl ScalarValue { } } DataType::Decimal128(precision, scale) => { - validate_decimal_precision_and_scale::( + Self::validate_decimal_or_internal_err::( *precision, *scale, )?; - if *scale < 0 { - return _internal_err!("Negative scale is not supported"); - } + assert_or_internal_err!(*scale >= 0, "Negative scale is not supported"); match i128::from(10).checked_pow(*scale as u32) { Some(value) => { ScalarValue::Decimal128(Some(-value), *precision, *scale) @@ -1694,12 +1876,10 @@ impl ScalarValue { } } DataType::Decimal256(precision, scale) => { - validate_decimal_precision_and_scale::( + Self::validate_decimal_or_internal_err::( *precision, *scale, )?; - if *scale < 0 { - return _internal_err!("Negative scale is not supported"); - } + assert_or_internal_err!(*scale >= 0, "Negative scale is not supported"); match i256::from(10).checked_pow(*scale as u32) { Some(value) => { ScalarValue::Decimal256(Some(-value), *precision, *scale) @@ -1729,14 +1909,10 @@ impl ScalarValue { DataType::Float32 => ScalarValue::Float32(Some(10.0)), DataType::Float64 => ScalarValue::Float64(Some(10.0)), DataType::Decimal32(precision, scale) => { - if let Err(err) = validate_decimal_precision_and_scale::( + Self::validate_decimal_or_internal_err::( *precision, *scale, - ) { - return _internal_err!("Invalid precision and scale {err}"); - } - if *scale <= 0 { - return _internal_err!("Negative scale is not supported"); - } + )?; + assert_or_internal_err!(*scale >= 0, "Negative scale is not supported"); match 10_i32.checked_pow((*scale + 1) as u32) { Some(value) => { ScalarValue::Decimal32(Some(value), *precision, *scale) @@ -1745,14 +1921,10 @@ impl ScalarValue { } } DataType::Decimal64(precision, scale) => { - if let Err(err) = validate_decimal_precision_and_scale::( + Self::validate_decimal_or_internal_err::( *precision, *scale, - ) { - return _internal_err!("Invalid precision and scale {err}"); - } - if *scale <= 0 { - return _internal_err!("Negative scale is not supported"); - } + )?; + assert_or_internal_err!(*scale >= 0, "Negative scale is not supported"); match i64::from(10).checked_pow((*scale + 1) as u32) { Some(value) => { ScalarValue::Decimal64(Some(value), *precision, *scale) @@ -1761,14 +1933,10 @@ impl ScalarValue { } } DataType::Decimal128(precision, scale) => { - if let Err(err) = validate_decimal_precision_and_scale::( + Self::validate_decimal_or_internal_err::( *precision, *scale, - ) { - return _internal_err!("Invalid precision and scale {err}"); - } - if *scale < 0 { - return _internal_err!("Negative scale is not supported"); - } + )?; + assert_or_internal_err!(*scale >= 0, "Negative scale is not supported"); match i128::from(10).checked_pow((*scale + 1) as u32) { Some(value) => { ScalarValue::Decimal128(Some(value), *precision, *scale) @@ -1777,14 +1945,10 @@ impl ScalarValue { } } DataType::Decimal256(precision, scale) => { - if let Err(err) = validate_decimal_precision_and_scale::( + 
Self::validate_decimal_or_internal_err::( *precision, *scale, - ) { - return _internal_err!("Invalid precision and scale {err}"); - } - if *scale < 0 { - return _internal_err!("Negative scale is not supported"); - } + )?; + assert_or_internal_err!(*scale >= 0, "Negative scale is not supported"); match i256::from(10).checked_pow((*scale + 1) as u32) { Some(value) => { ScalarValue::Decimal256(Some(value), *precision, *scale) @@ -1849,6 +2013,8 @@ impl ScalarValue { ScalarValue::List(arr) => arr.data_type().to_owned(), ScalarValue::LargeList(arr) => arr.data_type().to_owned(), ScalarValue::FixedSizeList(arr) => arr.data_type().to_owned(), + ScalarValue::ListView(arr) => arr.data_type().to_owned(), + ScalarValue::LargeListView(arr) => arr.data_type().to_owned(), ScalarValue::Struct(arr) => arr.data_type().to_owned(), ScalarValue::Map(arr) => arr.data_type().to_owned(), ScalarValue::Date32(_) => DataType::Date32, @@ -1878,10 +2044,219 @@ impl ScalarValue { ScalarValue::Dictionary(k, v) => { DataType::Dictionary(k.clone(), Box::new(v.data_type())) } + ScalarValue::RunEndEncoded(run_ends_field, value_field, _) => { + DataType::RunEndEncoded( + Arc::clone(run_ends_field), + Arc::clone(value_field), + ) + } ScalarValue::Null => DataType::Null, } } + #[inline] + fn can_use_direct_add(lhs: &ScalarValue, rhs: &ScalarValue) -> bool { + matches!( + (lhs, rhs), + (ScalarValue::Int8(_), ScalarValue::Int8(_)) + | (ScalarValue::Int16(_), ScalarValue::Int16(_)) + | (ScalarValue::Int32(_), ScalarValue::Int32(_)) + | (ScalarValue::Int64(_), ScalarValue::Int64(_)) + | (ScalarValue::UInt8(_), ScalarValue::UInt8(_)) + | (ScalarValue::UInt16(_), ScalarValue::UInt16(_)) + | (ScalarValue::UInt32(_), ScalarValue::UInt32(_)) + | (ScalarValue::UInt64(_), ScalarValue::UInt64(_)) + | (ScalarValue::Float16(_), ScalarValue::Float16(_)) + | (ScalarValue::Float32(_), ScalarValue::Float32(_)) + | (ScalarValue::Float64(_), ScalarValue::Float64(_)) + | ( + ScalarValue::Decimal32(_, _, _), + ScalarValue::Decimal32(_, _, _) + ) + | ( + ScalarValue::Decimal64(_, _, _), + ScalarValue::Decimal64(_, _, _) + ) + | ( + ScalarValue::Decimal128(_, _, _), + ScalarValue::Decimal128(_, _, _), + ) + | ( + ScalarValue::Decimal256(_, _, _), + ScalarValue::Decimal256(_, _, _), + ) + ) + } + + #[inline] + fn add_optional( + lhs: &mut Option, + rhs: Option, + checked: bool, + ) -> Result<()> { + match rhs { + Some(rhs) => { + if let Some(lhs) = lhs.as_mut() { + *lhs = if checked { + lhs.add_checked(rhs).map_err(|e| arrow_datafusion_err!(e))? + } else { + lhs.add_wrapping(rhs) + }; + } + } + None => *lhs = None, + } + Ok(()) + } + + #[inline] + fn add_decimal_values( + lhs_value: &mut Option, + lhs_precision: &mut u8, + lhs_scale: &mut i8, + rhs_value: Option, + rhs_precision: u8, + rhs_scale: i8, + ) -> Result<()> + where + T::Native: ArrowNativeTypeOp, + { + Self::validate_decimal_or_internal_err::(*lhs_precision, *lhs_scale)?; + Self::validate_decimal_or_internal_err::(rhs_precision, rhs_scale)?; + + let result_scale = (*lhs_scale).max(rhs_scale); + // Decimal scales can be negative, so use a wider signed type for the + // intermediate precision arithmetic. 
+ let lhs_precision_delta = i16::from(*lhs_precision) - i16::from(*lhs_scale); + let rhs_precision_delta = i16::from(rhs_precision) - i16::from(rhs_scale); + let result_precision = + (i16::from(result_scale) + lhs_precision_delta.max(rhs_precision_delta) + 1) + .min(i16::from(T::MAX_PRECISION)) as u8; + + Self::validate_decimal_or_internal_err::(result_precision, result_scale)?; + + let lhs_mul = T::Native::usize_as(10) + .pow_checked((result_scale - *lhs_scale) as u32) + .map_err(|e| arrow_datafusion_err!(e))?; + let rhs_mul = T::Native::usize_as(10) + .pow_checked((result_scale - rhs_scale) as u32) + .map_err(|e| arrow_datafusion_err!(e))?; + + let result_value = match (*lhs_value, rhs_value) { + (Some(lhs_value), Some(rhs_value)) => Some( + lhs_value + .mul_checked(lhs_mul) + .and_then(|lhs| { + rhs_value + .mul_checked(rhs_mul) + .and_then(|rhs| lhs.add_checked(rhs)) + }) + .map_err(|e| arrow_datafusion_err!(e))?, + ), + _ => None, + }; + + *lhs_value = result_value; + *lhs_precision = result_precision; + *lhs_scale = result_scale; + + Ok(()) + } + + #[inline] + fn try_add_in_place_impl( + &mut self, + other: &ScalarValue, + checked: bool, + ) -> Result { + match (self, other) { + (ScalarValue::Int8(lhs), ScalarValue::Int8(rhs)) => { + Self::add_optional(lhs, *rhs, checked)?; + } + (ScalarValue::Int16(lhs), ScalarValue::Int16(rhs)) => { + Self::add_optional(lhs, *rhs, checked)?; + } + (ScalarValue::Int32(lhs), ScalarValue::Int32(rhs)) => { + Self::add_optional(lhs, *rhs, checked)?; + } + (ScalarValue::Int64(lhs), ScalarValue::Int64(rhs)) => { + Self::add_optional(lhs, *rhs, checked)?; + } + (ScalarValue::UInt8(lhs), ScalarValue::UInt8(rhs)) => { + Self::add_optional(lhs, *rhs, checked)?; + } + (ScalarValue::UInt16(lhs), ScalarValue::UInt16(rhs)) => { + Self::add_optional(lhs, *rhs, checked)?; + } + (ScalarValue::UInt32(lhs), ScalarValue::UInt32(rhs)) => { + Self::add_optional(lhs, *rhs, checked)?; + } + (ScalarValue::UInt64(lhs), ScalarValue::UInt64(rhs)) => { + Self::add_optional(lhs, *rhs, checked)?; + } + (ScalarValue::Float16(lhs), ScalarValue::Float16(rhs)) => { + Self::add_optional(lhs, *rhs, checked)?; + } + (ScalarValue::Float32(lhs), ScalarValue::Float32(rhs)) => { + Self::add_optional(lhs, *rhs, checked)?; + } + (ScalarValue::Float64(lhs), ScalarValue::Float64(rhs)) => { + Self::add_optional(lhs, *rhs, checked)?; + } + ( + ScalarValue::Decimal32(lhs, p, s), + ScalarValue::Decimal32(rhs, rhs_p, rhs_s), + ) => { + Self::add_decimal_values::( + lhs, p, s, *rhs, *rhs_p, *rhs_s, + )?; + } + ( + ScalarValue::Decimal64(lhs, p, s), + ScalarValue::Decimal64(rhs, rhs_p, rhs_s), + ) => { + Self::add_decimal_values::( + lhs, p, s, *rhs, *rhs_p, *rhs_s, + )?; + } + ( + ScalarValue::Decimal128(lhs, p, s), + ScalarValue::Decimal128(rhs, rhs_p, rhs_s), + ) => { + Self::add_decimal_values::( + lhs, p, s, *rhs, *rhs_p, *rhs_s, + )?; + } + ( + ScalarValue::Decimal256(lhs, p, s), + ScalarValue::Decimal256(rhs, rhs_p, rhs_s), + ) => { + Self::add_decimal_values::( + lhs, p, s, *rhs, *rhs_p, *rhs_s, + )?; + } + _ => return Ok(false), + } + + Ok(true) + } + + #[inline] + pub(crate) fn try_add_wrapping_in_place( + &mut self, + other: &ScalarValue, + ) -> Result { + self.try_add_in_place_impl(other, false) + } + + #[inline] + pub(crate) fn try_add_checked_in_place( + &mut self, + other: &ScalarValue, + ) -> Result { + self.try_add_in_place_impl(other, true) + } + /// Calculate arithmetic negation for a scalar value pub fn arithmetic_negate(&self) -> Result { fn neg_checked_with_ctx( @@ -1899,9 
+2274,7 @@ impl ScalarValue { | ScalarValue::Float16(None) | ScalarValue::Float32(None) | ScalarValue::Float64(None) => Ok(self.clone()), - ScalarValue::Float16(Some(v)) => { - Ok(ScalarValue::Float16(Some(f16::from_f32(-v.to_f32())))) - } + ScalarValue::Float16(Some(v)) => Ok(ScalarValue::Float16(Some(-v))), ScalarValue::Float64(Some(v)) => Ok(ScalarValue::Float64(Some(-v))), ScalarValue::Float32(Some(v)) => Ok(ScalarValue::Float32(Some(-v))), ScalarValue::Int8(Some(v)) => Ok(ScalarValue::Int8(Some(v.neg_checked()?))), @@ -2019,15 +2392,34 @@ impl ScalarValue { /// NB: operating on `ScalarValue` directly is not efficient, performance sensitive code /// should operate on Arrays directly, using vectorized array kernels pub fn add>(&self, other: T) -> Result { - let r = add_wrapping(&self.to_scalar()?, &other.borrow().to_scalar()?)?; + let other = other.borrow(); + if Self::can_use_direct_add(self, other) { + let mut result = self.clone(); + if result.try_add_wrapping_in_place(other)? { + return Ok(result); + } + debug_assert!(false, "fast-path eligibility drifted from implementation"); + } + + let r = add_wrapping(&self.to_scalar()?, &other.to_scalar()?)?; Self::try_from_array(r.as_ref(), 0) } + /// Checked addition of `ScalarValue` /// /// NB: operating on `ScalarValue` directly is not efficient, performance sensitive code /// should operate on Arrays directly, using vectorized array kernels pub fn add_checked>(&self, other: T) -> Result { - let r = add(&self.to_scalar()?, &other.borrow().to_scalar()?)?; + let other = other.borrow(); + if Self::can_use_direct_add(self, other) { + let mut result = self.clone(); + if result.try_add_checked_in_place(other)? { + return Ok(result); + } + debug_assert!(false, "fast-path eligibility drifted from implementation"); + } + + let r = add(&self.to_scalar()?, &other.to_scalar()?)?; Self::try_from_array(r.as_ref(), 0) } @@ -2133,6 +2525,8 @@ impl ScalarValue { ScalarValue::List(arr) => arr.len() == arr.null_count(), ScalarValue::LargeList(arr) => arr.len() == arr.null_count(), ScalarValue::FixedSizeList(arr) => arr.len() == arr.null_count(), + ScalarValue::ListView(arr) => arr.len() == arr.null_count(), + ScalarValue::LargeListView(arr) => arr.len() == arr.null_count(), ScalarValue::Struct(arr) => arr.len() == arr.null_count(), ScalarValue::Map(arr) => arr.len() == arr.null_count(), ScalarValue::Date32(v) => v.is_none(), @@ -2157,6 +2551,7 @@ impl ScalarValue { None => true, }, ScalarValue::Dictionary(_, v) => v.is_null(), + ScalarValue::RunEndEncoded(_, _, v) => v.is_null(), } } @@ -2187,6 +2582,25 @@ impl ScalarValue { (Self::Float64(Some(l)), Self::Float64(Some(r))) => { Some((l - r).abs().round() as _) } + (Self::Date32(Some(l)), Self::Date32(Some(r))) => Some(l.abs_diff(*r) as _), + (Self::Date64(Some(l)), Self::Date64(Some(r))) => Some(l.abs_diff(*r) as _), + // Timestamp values are stored as epoch ticks regardless of timezone + // annotation, so the distance is tz-independent (tz is display metadata). 
+ (Self::TimestampSecond(Some(l), _), Self::TimestampSecond(Some(r), _)) => { + Some(l.abs_diff(*r) as _) + } + ( + Self::TimestampMillisecond(Some(l), _), + Self::TimestampMillisecond(Some(r), _), + ) => Some(l.abs_diff(*r) as _), + ( + Self::TimestampMicrosecond(Some(l), _), + Self::TimestampMicrosecond(Some(r), _), + ) => Some(l.abs_diff(*r) as _), + ( + Self::TimestampNanosecond(Some(l), _), + Self::TimestampNanosecond(Some(r), _), + ) => Some(l.abs_diff(*r) as _), ( Self::Decimal128(Some(l), lprecision, lscale), Self::Decimal128(Some(r), rprecision, rscale), @@ -2293,18 +2707,20 @@ impl ScalarValue { macro_rules! build_array_primitive { ($ARRAY_TY:ident, $SCALAR_TY:ident) => {{ { - let array = scalars.map(|sv| { - if let ScalarValue::$SCALAR_TY(v) = sv { - Ok(v) - } else { - _exec_err!( - "Inconsistent types in ScalarValue::iter_to_array. \ + let array = scalars + .map(|sv| { + if let ScalarValue::$SCALAR_TY(v) = sv { + Ok(v) + } else { + _exec_err!( + "Inconsistent types in ScalarValue::iter_to_array. \ Expected {:?}, got {:?}", - data_type, sv - ) - } - }) - .collect::>()?; + data_type, + sv + ) + } + }) + .collect::>()?; Arc::new(array) } }}; @@ -2313,18 +2729,20 @@ impl ScalarValue { macro_rules! build_array_primitive_tz { ($ARRAY_TY:ident, $SCALAR_TY:ident, $TZ:expr) => {{ { - let array = scalars.map(|sv| { - if let ScalarValue::$SCALAR_TY(v, _) = sv { - Ok(v) - } else { - _exec_err!( - "Inconsistent types in ScalarValue::iter_to_array. \ + let array = scalars + .map(|sv| { + if let ScalarValue::$SCALAR_TY(v, _) = sv { + Ok(v) + } else { + _exec_err!( + "Inconsistent types in ScalarValue::iter_to_array. \ Expected {:?}, got {:?}", - data_type, sv - ) - } - }) - .collect::>()?; + data_type, + sv + ) + } + }) + .collect::>()?; Arc::new(array.with_timezone_opt($TZ.clone())) } }}; @@ -2335,18 +2753,20 @@ impl ScalarValue { macro_rules! build_array_string { ($ARRAY_TY:ident, $SCALAR_TY:ident) => {{ { - let array = scalars.map(|sv| { - if let ScalarValue::$SCALAR_TY(v) = sv { - Ok(v) - } else { - _exec_err!( - "Inconsistent types in ScalarValue::iter_to_array. \ + let array = scalars + .map(|sv| { + if let ScalarValue::$SCALAR_TY(v) = sv { + Ok(v) + } else { + _exec_err!( + "Inconsistent types in ScalarValue::iter_to_array. 
\ Expected {:?}, got {:?}", - data_type, sv - ) - } - }) - .collect::>()?; + data_type, + sv + ) + } + }) + .collect::>()?; Arc::new(array) } }}; @@ -2477,6 +2897,8 @@ impl ScalarValue { } DataType::List(_) | DataType::LargeList(_) + | DataType::ListView(_) + | DataType::LargeListView(_) | DataType::Map(_, _) | DataType::Struct(_) | DataType::Union(_, _) => { @@ -2518,6 +2940,94 @@ impl ScalarValue { _ => unreachable!("Invalid dictionary keys type: {}", key_type), } } + DataType::RunEndEncoded(run_ends_field, value_field) => { + fn make_run_array( + scalars: impl IntoIterator, + run_ends_field: &FieldRef, + values_field: &FieldRef, + ) -> Result { + let mut scalars = scalars.into_iter(); + + let mut run_ends = vec![]; + let mut value_scalars = vec![]; + + let mut len = R::Native::ONE; + let mut current = + if let Some(ScalarValue::RunEndEncoded(_, _, scalar)) = + scalars.next() + { + *scalar + } else { + // We are guaranteed to have one element of correct + // type because we peeked above + unreachable!() + }; + for scalar in scalars { + let scalar = match scalar { + ScalarValue::RunEndEncoded( + inner_run_ends_field, + inner_value_field, + scalar, + ) if &inner_run_ends_field == run_ends_field + && &inner_value_field == values_field => + { + *scalar + } + _ => { + return _exec_err!( + "Expected RunEndEncoded scalar with run-ends field {run_ends_field} but got: {scalar:?}" + ); + } + }; + + // new run + if scalar != current { + run_ends.push(len); + value_scalars.push(current); + current = scalar; + } + + len = len.add_checked(R::Native::ONE).map_err(|_| { + DataFusionError::Execution(format!( + "Cannot construct RunArray: Overflows run-ends type {}", + run_ends_field.data_type() + )) + })?; + } + + run_ends.push(len); + value_scalars.push(current); + + let run_ends = PrimitiveArray::::from_iter_values(run_ends); + let values = ScalarValue::iter_to_array(value_scalars)?; + + // Using ArrayDataBuilder so we can maintain the fields + let dt = DataType::RunEndEncoded( + Arc::clone(run_ends_field), + Arc::clone(values_field), + ); + let builder = ArrayDataBuilder::new(dt) + .len(RunArray::logical_len(&run_ends)) + .add_child_data(run_ends.to_data()) + .add_child_data(values.to_data()); + let run_array = RunArray::::from(builder.build()?); + + Ok(Arc::new(run_array)) + } + + match run_ends_field.data_type() { + DataType::Int16 => { + make_run_array::(scalars, run_ends_field, value_field)? + } + DataType::Int32 => { + make_run_array::(scalars, run_ends_field, value_field)? + } + DataType::Int64 => { + make_run_array::(scalars, run_ends_field, value_field)? 
+ } + dt => unreachable!("Invalid run-ends type: {dt}"), + } + } DataType::FixedSizeBinary(size) => { let array = scalars .map(|sv| { @@ -2545,10 +3055,7 @@ impl ScalarValue { DataType::Time32(TimeUnit::Microsecond) | DataType::Time32(TimeUnit::Nanosecond) | DataType::Time64(TimeUnit::Second) - | DataType::Time64(TimeUnit::Millisecond) - | DataType::RunEndEncoded(_, _) - | DataType::ListView(_) - | DataType::LargeListView(_) => { + | DataType::Time64(TimeUnit::Millisecond) => { return _not_impl_err!( "Unsupported creation of {:?} array from ScalarValue {:?}", data_type, @@ -2648,71 +3155,6 @@ impl ScalarValue { Ok(array) } - fn build_decimal32_array( - value: Option, - precision: u8, - scale: i8, - size: usize, - ) -> Result { - Ok(match value { - Some(val) => Decimal32Array::from(vec![val; size]) - .with_precision_and_scale(precision, scale)?, - None => { - let mut builder = Decimal32Array::builder(size) - .with_precision_and_scale(precision, scale)?; - builder.append_nulls(size); - builder.finish() - } - }) - } - - fn build_decimal64_array( - value: Option, - precision: u8, - scale: i8, - size: usize, - ) -> Result { - Ok(match value { - Some(val) => Decimal64Array::from(vec![val; size]) - .with_precision_and_scale(precision, scale)?, - None => { - let mut builder = Decimal64Array::builder(size) - .with_precision_and_scale(precision, scale)?; - builder.append_nulls(size); - builder.finish() - } - }) - } - - fn build_decimal128_array( - value: Option, - precision: u8, - scale: i8, - size: usize, - ) -> Result { - Ok(match value { - Some(val) => Decimal128Array::from(vec![val; size]) - .with_precision_and_scale(precision, scale)?, - None => { - let mut builder = Decimal128Array::builder(size) - .with_precision_and_scale(precision, scale)?; - builder.append_nulls(size); - builder.finish() - } - }) - } - - fn build_decimal256_array( - value: Option, - precision: u8, - scale: i8, - size: usize, - ) -> Result { - Ok(repeat_n(value, size) - .collect::() - .with_precision_and_scale(precision, scale)?) - } - /// Converts `Vec` where each element has type corresponding to /// `data_type`, to a single element [`ListArray`]. 
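A minimal sketch of the `from_value` pattern that supersedes the removed `build_decimal*_array` helpers (illustrative values; `from_value` repeats one native value `size` times):

    // Roughly equivalent to the old build_decimal128_array(Some(123), 10, 2, 4)
    let arr = Decimal128Array::from_value(123_i128, 4)
        .with_precision_and_scale(10, 2)?;
    assert_eq!(arr.len(), 4);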
     ///
@@ -2863,23 +3305,40 @@ impl ScalarValue {
     ///
     /// Errors if `self` is
     /// - a decimal that fails to be converted to a decimal array of size
-    /// - a `FixedsizeList` that fails to be concatenated into an array of size
+    /// - a `FixedSizeList` that fails to be concatenated into an array of size
     /// - a `List` that fails to be concatenated into an array of size
     /// - a `Dictionary` that fails to be converted to a dictionary array of size
     pub fn to_array_of_size(&self, size: usize) -> Result<ArrayRef> {
         Ok(match self {
-            ScalarValue::Decimal32(e, precision, scale) => Arc::new(
-                ScalarValue::build_decimal32_array(*e, *precision, *scale, size)?,
+            ScalarValue::Decimal32(Some(e), precision, scale) => Arc::new(
+                Decimal32Array::from_value(*e, size)
+                    .with_precision_and_scale(*precision, *scale)?,
             ),
-            ScalarValue::Decimal64(e, precision, scale) => Arc::new(
-                ScalarValue::build_decimal64_array(*e, *precision, *scale, size)?,
+            ScalarValue::Decimal32(None, precision, scale) => {
+                new_null_array(&DataType::Decimal32(*precision, *scale), size)
+            }
+            ScalarValue::Decimal64(Some(e), precision, scale) => Arc::new(
+                Decimal64Array::from_value(*e, size)
+                    .with_precision_and_scale(*precision, *scale)?,
             ),
-            ScalarValue::Decimal128(e, precision, scale) => Arc::new(
-                ScalarValue::build_decimal128_array(*e, *precision, *scale, size)?,
+            ScalarValue::Decimal64(None, precision, scale) => {
+                new_null_array(&DataType::Decimal64(*precision, *scale), size)
+            }
+            ScalarValue::Decimal128(Some(e), precision, scale) => Arc::new(
+                Decimal128Array::from_value(*e, size)
+                    .with_precision_and_scale(*precision, *scale)?,
             ),
-            ScalarValue::Decimal256(e, precision, scale) => Arc::new(
-                ScalarValue::build_decimal256_array(*e, *precision, *scale, size)?,
+            ScalarValue::Decimal128(None, precision, scale) => {
+                new_null_array(&DataType::Decimal128(*precision, *scale), size)
+            }
+            ScalarValue::Decimal256(Some(e), precision, scale) => Arc::new(
+                Decimal256Array::from_value(*e, size)
+                    .with_precision_and_scale(*precision, *scale)?,
             ),
+            ScalarValue::Decimal256(None, precision, scale) => {
+                new_null_array(&DataType::Decimal256(*precision, *scale), size)
+            }
+
             ScalarValue::Boolean(e) => match e {
                 None => new_null_array(&DataType::Boolean, size),
                 Some(true) => {
@@ -2952,33 +3411,35 @@ impl ScalarValue {
                 )
             }
             ScalarValue::Utf8(e) => match e {
-                Some(value) => {
-                    Arc::new(StringArray::from_iter_values(repeat_n(value, size)))
-                }
+                Some(value) => Arc::new(StringArray::new_repeated(value, size)),
                 None => new_null_array(&DataType::Utf8, size),
             },
             ScalarValue::Utf8View(e) => match e {
                 Some(value) => {
-                    Arc::new(StringViewArray::from_iter_values(repeat_n(value, size)))
+                    let mut builder = StringViewBuilder::with_capacity(size);
+                    builder.try_append_value_n(value, size)?;
+                    let array = builder.finish();
+                    Arc::new(array)
                 }
                 None => new_null_array(&DataType::Utf8View, size),
             },
             ScalarValue::LargeUtf8(e) => match e {
-                Some(value) => {
-                    Arc::new(LargeStringArray::from_iter_values(repeat_n(value, size)))
-                }
+                Some(value) => Arc::new(LargeStringArray::new_repeated(value, size)),
                 None => new_null_array(&DataType::LargeUtf8, size),
             },
             ScalarValue::Binary(e) => match e {
-                Some(value) => Arc::new(
-                    repeat_n(Some(value.as_slice()), size).collect::<BinaryArray>(),
-                ),
+                Some(value) => {
+                    Arc::new(BinaryArray::new_repeated(value.as_slice(), size))
+                }
                 None => new_null_array(&DataType::Binary, size),
             },
             ScalarValue::BinaryView(e) => match e {
-                Some(value) => Arc::new(
-                    repeat_n(Some(value.as_slice()), size).collect::<BinaryViewArray>(),
-                ),
+                Some(value) => {
+                    let mut
builder = BinaryViewBuilder::with_capacity(size); + builder.try_append_value_n(value, size)?; + let array = builder.finish(); + Arc::new(array) + } None => new_null_array(&DataType::BinaryView, size), }, ScalarValue::FixedSizeBinary(s, e) => match e { @@ -2992,9 +3453,9 @@ impl ScalarValue { None => Arc::new(FixedSizeBinaryArray::new_null(*s, size)), }, ScalarValue::LargeBinary(e) => match e { - Some(value) => Arc::new( - repeat_n(Some(value.as_slice()), size).collect::(), - ), + Some(value) => { + Arc::new(LargeBinaryArray::new_repeated(value.as_slice(), size)) + } None => new_null_array(&DataType::LargeBinary, size), }, ScalarValue::List(arr) => { @@ -3015,6 +3476,18 @@ impl ScalarValue { } Self::list_to_array_of_size(arr.as_ref() as &dyn Array, size)? } + ScalarValue::ListView(arr) => { + if size == 1 { + return Ok(Arc::clone(arr) as Arc); + } + Self::list_to_array_of_size(arr.as_ref() as &dyn Array, size)? + } + ScalarValue::LargeListView(arr) => { + if size == 1 { + return Ok(Arc::clone(arr) as Arc); + } + Self::list_to_array_of_size(arr.as_ref() as &dyn Array, size)? + } ScalarValue::Struct(arr) => { if size == 1 { return Ok(Arc::clone(arr) as Arc); @@ -3153,10 +3626,7 @@ impl ScalarValue { .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; Arc::new(ar) } - None => { - let dt = self.data_type(); - new_null_array(&dt, size) - } + None => new_null_array(&DataType::Union(fields.clone(), *mode), size), }, ScalarValue::Dictionary(key_type, v) => { // values array is one element long (the value) @@ -3172,6 +3642,54 @@ impl ScalarValue { _ => unreachable!("Invalid dictionary keys type: {}", key_type), } } + ScalarValue::RunEndEncoded(run_ends_field, values_field, value) => { + fn make_run_array( + run_ends_field: &Arc, + values_field: &Arc, + value: &ScalarValue, + size: usize, + ) -> Result { + let size_native = R::Native::from_usize(size) + .ok_or_else(|| DataFusionError::Execution(format!("Cannot construct RunArray of size {size}: Overflows run-ends type {}", R::DATA_TYPE)))?; + let values = value.to_array_of_size(1)?; + let run_ends = + PrimitiveArray::::new(vec![size_native].into(), None); + + // Using ArrayDataBuilder so we can maintain the fields + let dt = DataType::RunEndEncoded( + Arc::clone(run_ends_field), + Arc::clone(values_field), + ); + let builder = ArrayDataBuilder::new(dt) + .len(size) + .add_child_data(run_ends.to_data()) + .add_child_data(values.to_data()); + let run_array = RunArray::::from(builder.build()?); + + Ok(Arc::new(run_array)) + } + match run_ends_field.data_type() { + DataType::Int16 => make_run_array::( + run_ends_field, + values_field, + value, + size, + )?, + DataType::Int32 => make_run_array::( + run_ends_field, + values_field, + value, + size, + )?, + DataType::Int64 => make_run_array::( + run_ends_field, + values_field, + value, + size, + )?, + dt => unreachable!("Invalid run-ends type: {dt}"), + } + } ScalarValue::Null => get_or_create_cached_null_array(size), }) } @@ -3225,13 +3743,22 @@ impl ScalarValue { } } + /// Repeats the rows of `arr` `size` times, producing an array with + /// `arr.len() * size` total rows. 
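For example (a sketch mirroring the tests further below), repeating a one-row list three times via the private helper documented above:

    // [[10, 20]] repeated 3 times -> [[10, 20], [10, 20], [10, 20]]
    let arr = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![Some(vec![
        Some(10),
        Some(20),
    ])]);
    let repeated = ScalarValue::list_to_array_of_size(&arr, 3)?;
    assert_eq!(repeated.len(), 3);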
fn list_to_array_of_size(arr: &dyn Array, size: usize) -> Result { - let arrays = repeat_n(arr, size).collect::>(); - let ret = match !arrays.is_empty() { - true => arrow::compute::concat(arrays.as_slice())?, - false => arr.slice(0, 0), - }; - Ok(ret) + if size == 0 { + return Ok(arr.slice(0, 0)); + } + + // Examples: given `arr = [[A, B, C]]` and `size = 3`, `indices = [0, 0, 0]` and + // the result is `[[A, B, C], [A, B, C], [A, B, C]]`. + // + // Given `arr = [[A, B], [C]]` and `size = 2`, `indices = [0, 1, 0, 1]` and the + // result is `[[A, B], [C], [A, B], [C]]`. (But in practice, we are always called + // with `arr.len() == 1`.) + let n = arr.len() as u32; + let indices = UInt32Array::from_iter_values((0..size).flat_map(|_| 0..n)); + Ok(arrow::compute::take(arr, &indices, None)?) } /// Retrieve ScalarValue for each row in `array` @@ -3338,29 +3865,35 @@ impl ScalarValue { pub fn convert_array_to_scalar_vec( array: &dyn Array, ) -> Result>>> { - fn generic_collect( - array: &dyn Array, - ) -> Result>>> { - array - .as_list::() - .iter() - .map(|nested_array| { - nested_array - .map(|array| { - (0..array.len()) - .map(|i| ScalarValue::try_from_array(&array, i)) - .collect::>>() - }) - .transpose() + fn map_element( + nested_array: Option, + ) -> Result>> { + nested_array + .map(|array| { + (0..array.len()) + .map(|i| ScalarValue::try_from_array(&array, i)) + .collect::>>() }) - .collect() + .transpose() } match array.data_type() { - DataType::List(_) => generic_collect::(array), - DataType::LargeList(_) => generic_collect::(array), + DataType::List(_) => array.as_list::().iter().map(map_element).collect(), + DataType::LargeList(_) => { + array.as_list::().iter().map(map_element).collect() + } + DataType::ListView(_) => array + .as_list_view::() + .iter() + .map(map_element) + .collect(), + DataType::LargeListView(_) => array + .as_list_view::() + .iter() + .map(map_element) + .collect(), _ => _internal_err!( - "ScalarValue::convert_array_to_scalar_vec input must be a List/LargeList type" + "ScalarValue::convert_array_to_scalar_vec input must be a List/LargeList/ListView/LargeListView type" ), } } @@ -3379,7 +3912,7 @@ impl ScalarValue { /// Converts a value in `array` at `index` into a ScalarValue pub fn try_from_array(array: &dyn Array, index: usize) -> Result { // handle NULL value - if !array.is_valid(index) { + if array.is_null(index) { return array.data_type().try_into(); } @@ -3457,6 +3990,22 @@ impl ScalarValue { .with_field(field) .build_fixed_size_list_scalar(list_size) } + DataType::ListView(field) => { + let list_array = as_list_view_array(array)?; + let nested_array = list_array.value(index); + // Produces a single element `ListViewArray` with the value at `index`. + SingleRowListArrayBuilder::new(nested_array) + .with_field(field) + .build_list_view_scalar() + } + DataType::LargeListView(field) => { + let list_array = as_large_list_view_array(array)?; + let nested_array = list_array.value(index); + // Produces a single element `LargeListViewArray` with the value at `index`. 
+                SingleRowListArrayBuilder::new(nested_array)
+                    .with_field(field)
+                    .build_large_list_view_scalar()
+            }
             DataType::Date32 => typed_cast!(array, index, as_date32_array, Date32)?,
             DataType::Date64 => typed_cast!(array, index, as_date64_array, Date64)?,
             DataType::Time32(TimeUnit::Second) => {
@@ -3522,6 +4071,28 @@
                 Self::Dictionary(key_type.clone(), Box::new(value))
             }
+            DataType::RunEndEncoded(run_ends_field, value_field) => {
+                // Explicitly check length here since get_physical_index() doesn't
+                // bound check for us
+                if index >= array.len() {
+                    return _exec_err!(
+                        "Index {index} out of bounds for array of length {}",
+                        array.len()
+                    );
+                }
+                let scalar = downcast_run_array!(
+                    array => {
+                        let index = array.get_physical_index(index);
+                        ScalarValue::try_from_array(array.values(), index)?
+                    },
+                    dt => unreachable!("Invalid run-ends type: {dt}")
+                );
+                Self::RunEndEncoded(
+                    Arc::clone(run_ends_field),
+                    Arc::clone(value_field),
+                    Box::new(scalar),
+                )
+            }
             DataType::Struct(_) => {
                 let a = array.slice(index, 1);
                 Self::Struct(Arc::new(a.as_struct().to_owned()))
@@ -3634,6 +4205,7 @@ impl ScalarValue {
             ScalarValue::LargeUtf8(v) => v,
             ScalarValue::Utf8View(v) => v,
             ScalarValue::Dictionary(_, v) => return v.try_as_str(),
+            ScalarValue::RunEndEncoded(_, _, v) => return v.try_as_str(),
             _ => return None,
         };
         Some(v.as_ref().map(|v| v.as_str()))
@@ -3650,11 +4222,38 @@ impl ScalarValue {
         target_type: &DataType,
         cast_options: &CastOptions<'static>,
     ) -> Result<Self> {
+        let source_type = self.data_type();
+        if let Some(multiplier) = date_to_timestamp_multiplier(&source_type, target_type)
+            && let Some(value) = self.date_scalar_value_as_i64()
+        {
+            ensure_timestamp_in_bounds(value, multiplier, &source_type, target_type)?;
+        }
+
         let scalar_array = self.to_array()?;
-        let cast_arr = cast_with_options(&scalar_array, target_type, cast_options)?;
+
+        // For types that contain structs (including nested inside Lists, Dictionaries,
+        // etc.), use name-based casting logic that matches struct fields by name and
+        // recursively casts nested structs.
+        let cast_arr = if crate::nested_struct::requires_nested_struct_cast(
+            scalar_array.data_type(),
+            target_type,
+        ) {
+            crate::nested_struct::cast_column(&scalar_array, target_type, cast_options)?
+        } else {
+            cast_with_options(&scalar_array, target_type, cast_options)?
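As an illustration of the name-based path (hypothetical field shapes, not part of this patch):

    // Hypothetical shapes: fields are matched by name, not position.
    let source = DataType::Struct(Fields::from(vec![
        Field::new("a", DataType::Int32, true),
        Field::new("b", DataType::Utf8, true),
    ]));
    let target = DataType::Struct(Fields::from(vec![
        Field::new("b", DataType::Utf8, true),
        Field::new("a", DataType::Int64, true),
    ]));
    // A ScalarValue::Struct of `source` cast via `cast_to(&target, &options)`
    // widens `a` to Int64 and keeps each value paired with its named field,
    // where a purely positional cast would mispair the fields.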
+        };
+
         ScalarValue::try_from_array(&cast_arr, 0)
     }
+    fn date_scalar_value_as_i64(&self) -> Option<i64> {
+        match self {
+            ScalarValue::Date32(Some(value)) => Some(i64::from(*value)),
+            ScalarValue::Date64(Some(value)) => Some(*value),
+            _ => None,
+        }
+    }
+
     fn eq_array_decimal32(
         array: &ArrayRef,
         index: usize,
@@ -3862,6 +4461,12 @@ impl ScalarValue {
             ScalarValue::FixedSizeList(arr) => {
                 Self::eq_array_list(&(arr.to_owned() as ArrayRef), array, index)
             }
+            ScalarValue::ListView(arr) => {
+                Self::eq_array_list(&(arr.to_owned() as ArrayRef), array, index)
+            }
+            ScalarValue::LargeListView(arr) => {
+                Self::eq_array_list(&(arr.to_owned() as ArrayRef), array, index)
+            }
             ScalarValue::Struct(arr) => {
                 Self::eq_array_list(&(arr.to_owned() as ArrayRef), array, index)
             }
@@ -3947,6 +4552,34 @@ impl ScalarValue {
                     None => v.is_null(),
                 }
             }
+            ScalarValue::RunEndEncoded(run_ends_field, _, value) => {
+                // Explicitly check length here since get_physical_index() doesn't
+                // bound check for us
+                if index >= array.len() {
+                    return _exec_err!(
+                        "Index {index} out of bounds for array of length {}",
+                        array.len()
+                    );
+                }
+                match run_ends_field.data_type() {
+                    DataType::Int16 => {
+                        let array = as_run_array::<Int16Type>(array)?;
+                        let index = array.get_physical_index(index);
+                        value.eq_array(array.values(), index)?
+                    }
+                    DataType::Int32 => {
+                        let array = as_run_array::<Int32Type>(array)?;
+                        let index = array.get_physical_index(index);
+                        value.eq_array(array.values(), index)?
+                    }
+                    DataType::Int64 => {
+                        let array = as_run_array::<Int64Type>(array)?;
+                        let index = array.get_physical_index(index);
+                        value.eq_array(array.values(), index)?
+                    }
+                    dt => unreachable!("Invalid run-ends type: {dt}"),
+                }
+            }
             ScalarValue::Null => array.is_null(index),
         })
     }
@@ -4021,6 +4654,8 @@ impl ScalarValue {
             ScalarValue::List(arr) => arr.get_array_memory_size(),
             ScalarValue::LargeList(arr) => arr.get_array_memory_size(),
             ScalarValue::FixedSizeList(arr) => arr.get_array_memory_size(),
+            ScalarValue::ListView(arr) => arr.get_array_memory_size(),
+            ScalarValue::LargeListView(arr) => arr.get_array_memory_size(),
             ScalarValue::Struct(arr) => arr.get_array_memory_size(),
             ScalarValue::Map(arr) => arr.get_array_memory_size(),
             ScalarValue::Union(vals, fields, _mode) => {
@@ -4036,6 +4671,7 @@
                 // `dt` and `sv` are boxed, so they are NOT already included in `self`
                 dt.size() + sv.size()
             }
+            ScalarValue::RunEndEncoded(rf, vf, v) => rf.size() + vf.size() + v.size(),
         }
     }
@@ -4066,6 +4702,7 @@
     /// Estimates [size](Self::size) of [`HashSet`] in bytes.
     ///
     /// Includes the size of the [`HashSet`] container itself.
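A rough usage sketch of the estimate described above (default hasher assumed; the figure is capacity-based, not an exact byte count):

    let mut set: HashSet<ScalarValue> = HashSet::with_capacity(8);
    set.insert(ScalarValue::Int32(Some(1)));
    let estimate = ScalarValue::size_of_hashset(&set);
    // At least the container itself, plus capacity-proportional slots.
    assert!(estimate >= std::mem::size_of::<HashSet<ScalarValue>>());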
+ #[allow(clippy::allow_attributes, clippy::mutable_key_type)] // ScalarValue has interior mutability but is intentionally used as hash key pub fn size_of_hashset(set: &HashSet) -> usize { size_of_val(set) + (size_of::() * set.capacity()) @@ -4135,6 +4772,14 @@ impl ScalarValue { let array = copy_array_data(&arr.to_data()); *Arc::make_mut(arr) = LargeListArray::from(array) } + ScalarValue::ListView(arr) => { + let array = copy_array_data(&arr.to_data()); + *Arc::make_mut(arr) = ListViewArray::from(array); + } + ScalarValue::LargeListView(arr) => { + let array = copy_array_data(&arr.to_data()); + *Arc::make_mut(arr) = LargeListViewArray::from(array) + } ScalarValue::Struct(arr) => { let array = copy_array_data(&arr.to_data()); *Arc::make_mut(arr) = StructArray::from(array); @@ -4151,6 +4796,9 @@ impl ScalarValue { ScalarValue::Dictionary(_, value) => { value.compact(); } + ScalarValue::RunEndEncoded(_, _, value) => { + value.compact(); + } } } @@ -4354,6 +5002,20 @@ impl ScalarValue { _ => None, } } + + /// A thin wrapper on Arrow's validation that throws internal error if validation + /// fails. + fn validate_decimal_or_internal_err( + precision: u8, + scale: i8, + ) -> Result<()> { + validate_decimal_precision_and_scale::(precision, scale).map_err(|err| { + _internal_datafusion_err!( + "Decimal precision/scale invariant violated \ + (precision={precision}, scale={scale}): {err}" + ) + }) + } } /// Compacts the data of an `ArrayData` into a new `ArrayData`. @@ -4407,6 +5069,7 @@ macro_rules! impl_scalar { impl_scalar!(f64, Float64); impl_scalar!(f32, Float32); +impl_scalar!(f16, Float16); impl_scalar!(i8, Int8); impl_scalar!(i16, Int16); impl_scalar!(i32, Int32); @@ -4563,6 +5226,7 @@ impl_try_from!(UInt8, u8); impl_try_from!(UInt16, u16); impl_try_from!(UInt32, u32); impl_try_from!(UInt64, u64); +impl_try_from!(Float16, f16); impl_try_from!(Float32, f32); impl_try_from!(Float64, f64); impl_try_from!(Boolean, bool); @@ -4639,8 +5303,10 @@ impl fmt::Display for ScalarValue { | ScalarValue::BinaryView(e) => match e { Some(bytes) => { // print up to first 10 bytes, with trailing ... 
if needed + const HEX_CHARS_UPPER: &[u8; 16] = b"0123456789ABCDEF"; for b in bytes.iter().take(10) { - write!(f, "{b:02X}")?; + f.write_char(HEX_CHARS_UPPER[(b >> 4) as usize] as char)?; + f.write_char(HEX_CHARS_UPPER[(b & 0x0f) as usize] as char)?; } if bytes.len() > 10 { write!(f, "...")?; @@ -4648,9 +5314,11 @@ impl fmt::Display for ScalarValue { } None => write!(f, "NULL")?, }, - ScalarValue::List(arr) => fmt_list(arr.to_owned() as ArrayRef, f)?, - ScalarValue::LargeList(arr) => fmt_list(arr.to_owned() as ArrayRef, f)?, - ScalarValue::FixedSizeList(arr) => fmt_list(arr.to_owned() as ArrayRef, f)?, + ScalarValue::List(arr) => fmt_list(arr.as_ref(), f)?, + ScalarValue::LargeList(arr) => fmt_list(arr.as_ref(), f)?, + ScalarValue::FixedSizeList(arr) => fmt_list(arr.as_ref(), f)?, + ScalarValue::ListView(arr) => fmt_list(arr.as_ref(), f)?, + ScalarValue::LargeListView(arr) => fmt_list(arr.as_ref(), f)?, ScalarValue::Date32(e) => format_option!( f, e.map(|v| { @@ -4766,18 +5434,18 @@ impl fmt::Display for ScalarValue { None => write!(f, "NULL")?, }, ScalarValue::Dictionary(_k, v) => write!(f, "{v}")?, + ScalarValue::RunEndEncoded(_, _, v) => write!(f, "{v}")?, ScalarValue::Null => write!(f, "NULL")?, }; Ok(()) } } -fn fmt_list(arr: ArrayRef, f: &mut fmt::Formatter) -> fmt::Result { - // ScalarValue List, LargeList, FixedSizeList should always have a single element +fn fmt_list(arr: &dyn Array, f: &mut fmt::Formatter) -> fmt::Result { + // ScalarValue List, LargeList, FixedSizeList, ListView, LargeListView should always have a single element assert_eq!(arr.len(), 1); let options = FormatOptions::default().with_display_error(true); - let formatter = - ArrayFormatter::try_new(arr.as_ref() as &dyn Array, &options).unwrap(); + let formatter = ArrayFormatter::try_new(arr, &options).unwrap(); let value_formatter = formatter.value(0); write!(f, "{value_formatter}") } @@ -4860,6 +5528,8 @@ impl fmt::Debug for ScalarValue { ScalarValue::FixedSizeList(_) => write!(f, "FixedSizeList({self})"), ScalarValue::List(_) => write!(f, "List({self})"), ScalarValue::LargeList(_) => write!(f, "LargeList({self})"), + ScalarValue::ListView(_) => write!(f, "ListView({self})"), + ScalarValue::LargeListView(_) => write!(f, "LargeListView({self})"), ScalarValue::Struct(struct_arr) => { // ScalarValue Struct should always have a single element assert_eq!(struct_arr.len(), 1); @@ -4945,6 +5615,9 @@ impl fmt::Debug for ScalarValue { None => write!(f, "Union(NULL)"), }, ScalarValue::Dictionary(k, v) => write!(f, "Dictionary({k:?}, {v:?})"), + ScalarValue::RunEndEncoded(rf, vf, v) => { + write!(f, "RunEndEncoded({rf:?}, {vf:?}, {v:?})") + } ScalarValue::Null => write!(f, "NULL"), } } @@ -4994,24 +5667,26 @@ impl ScalarType for Date32Type { #[cfg(test)] mod tests { - use std::sync::Arc; use super::*; - use crate::cast::{as_list_array, as_map_array, as_struct_array}; + use crate::cast::{ + as_large_list_view_array, as_list_array, as_map_array, as_struct_array, + }; use crate::test_util::batches_to_string; use arrow::array::{ - FixedSizeListBuilder, Int32Builder, LargeListBuilder, ListBuilder, MapBuilder, - NullArray, NullBufferBuilder, OffsetSizeTrait, PrimitiveBuilder, RecordBatch, - StringBuilder, StringDictionaryBuilder, StructBuilder, UnionBuilder, + FixedSizeListBuilder, Int32Builder, LargeListBuilder, LargeListViewBuilder, + ListBuilder, ListViewBuilder, MapBuilder, NullArray, NullBufferBuilder, + OffsetSizeTrait, PrimitiveBuilder, RecordBatch, StringBuilder, + StringDictionaryBuilder, StructBuilder, UnionBuilder, }; use 
arrow::buffer::{Buffer, NullBuffer, OffsetBuffer}; use arrow::compute::{is_null, kernels}; use arrow::datatypes::{ - ArrowNumericType, Fields, Float64Type, DECIMAL256_MAX_PRECISION, + ArrowNumericType, DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION, Fields, + Float64Type, TimeUnit, }; use arrow::error::ArrowError; use arrow::util::pretty::pretty_format_columns; - use chrono::NaiveDate; use insta::assert_snapshot; use rand::Rng; @@ -5040,6 +5715,52 @@ mod tests { assert_eq!(actual, &expected); } + #[test] + fn test_format_timestamp_type_for_error_and_bounds() { + // format helper + let ts_ns = format_timestamp_type_for_error(&DataType::Timestamp( + TimeUnit::Nanosecond, + None, + )); + assert_eq!(ts_ns, "Timestamp(ns)"); + + let ts_us = format_timestamp_type_for_error(&DataType::Timestamp( + TimeUnit::Microsecond, + None, + )); + assert_eq!(ts_us, "Timestamp(us)"); + + // ensure_timestamp_in_bounds: Date32 non-overflow + let ok = ensure_timestamp_in_bounds( + 1000, + NANOS_PER_DAY, + &DataType::Date32, + &DataType::Timestamp(TimeUnit::Nanosecond, None), + ); + assert!(ok.is_ok()); + + // Date32 overflow -- known large day value (9999-12-31 -> 2932896) + let err = ensure_timestamp_in_bounds( + 2932896, + NANOS_PER_DAY, + &DataType::Date32, + &DataType::Timestamp(TimeUnit::Nanosecond, None), + ); + assert!(err.is_err()); + let msg = err.unwrap_err().to_string(); + assert!(msg.contains("Cannot cast Date32 value 2932896 to Timestamp(ns): converted value exceeds the representable i64 range")); + + // Date64 overflow for ns (millis * 1_000_000) + let overflow_millis: i64 = (i64::MAX / NANOS_PER_MILLISECOND) + 1; + let err2 = ensure_timestamp_in_bounds( + overflow_millis, + NANOS_PER_MILLISECOND, + &DataType::Date64, + &DataType::Timestamp(TimeUnit::Nanosecond, None), + ); + assert!(err2.is_err()); + } + #[test] fn test_scalar_value_from_for_struct() { let boolean = Arc::new(BooleanArray::from(vec![false])); @@ -5142,6 +5863,27 @@ mod tests { ]); assert_eq!(&arr, actual_list_arr); + + // ListView + let arr = + ListViewArray::from_iter_primitive::(vec![Some(vec![ + Some(1), + None, + Some(2), + ])]); + + let sv = ScalarValue::ListView(Arc::new(arr)); + let actual_arr = sv + .to_array_of_size(2) + .expect("Failed to convert to array of size"); + let actual_list_arr = actual_arr.as_list_view::(); + + let arr = ListViewArray::from_iter_primitive::(vec![ + Some(vec![Some(1), None, Some(2)]), + Some(vec![Some(1), None, Some(2)]), + ]); + + assert_eq!(&arr, actual_list_arr); } #[test] @@ -5171,6 +5913,91 @@ mod tests { assert_eq!(empty_array.len(), 0); } + #[test] + fn test_to_array_of_size_list_size_one() { + // size=1 takes the fast path (Arc::clone) + let arr = ListArray::from_iter_primitive::(vec![Some(vec![ + Some(10), + Some(20), + ])]); + let sv = ScalarValue::List(Arc::new(arr.clone())); + let result = sv.to_array_of_size(1).unwrap(); + assert_eq!(result.as_list::(), &arr); + } + + #[test] + fn test_to_array_of_size_list_empty_inner() { + // A list scalar containing an empty list: [[]] + let arr = ListArray::from_iter_primitive::(vec![Some(vec![])]); + let sv = ScalarValue::List(Arc::new(arr)); + let result = sv.to_array_of_size(3).unwrap(); + let result_list = result.as_list::(); + assert_eq!(result_list.len(), 3); + for i in 0..3 { + assert_eq!(result_list.value(i).len(), 0); + } + } + + #[test] + fn test_to_array_of_size_large_list() { + let arr = + LargeListArray::from_iter_primitive::(vec![Some(vec![ + Some(100), + Some(200), + ])]); + let sv = ScalarValue::LargeList(Arc::new(arr)); + 
let result = sv.to_array_of_size(3).unwrap(); + let expected = LargeListArray::from_iter_primitive::(vec![ + Some(vec![Some(100), Some(200)]), + Some(vec![Some(100), Some(200)]), + Some(vec![Some(100), Some(200)]), + ]); + assert_eq!(result.as_list::(), &expected); + } + + #[test] + fn test_list_to_array_of_size_multi_row() { + // Call list_to_array_of_size directly with arr.len() > 1 + let arr = Int32Array::from(vec![Some(10), None, Some(30)]); + let result = ScalarValue::list_to_array_of_size(&arr, 3).unwrap(); + let result = result.as_primitive::(); + assert_eq!( + result.iter().collect::>(), + vec![ + Some(10), + None, + Some(30), + Some(10), + None, + Some(30), + Some(10), + None, + Some(30), + ] + ); + } + + #[test] + fn test_to_array_of_size_null_list() { + let dt = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); + let sv = ScalarValue::try_from(&dt).unwrap(); + let result = sv.to_array_of_size(3).unwrap(); + assert_eq!(result.len(), 3); + assert_eq!(result.null_count(), 3); + } + + /// See https://github.com/apache/datafusion/issues/18870 + #[test] + fn test_to_array_of_size_for_none_fsb() { + let sv = ScalarValue::FixedSizeBinary(5, None); + let result = sv + .to_array_of_size(2) + .expect("Failed to convert to array of size"); + assert_eq!(result.len(), 2); + assert_eq!(result.null_count(), 2); + assert_eq!(result.as_fixed_size_binary().values().len(), 10); + } + #[test] fn test_list_to_array_string() { let scalars = vec![ @@ -5196,29 +6023,11 @@ mod tests { values .into_iter() .map(|v| { - let arr = if v.is_some() { - Arc::new( - GenericListArray::::from_iter_primitive::( - vec![v], - ), - ) - } else if O::IS_LARGE { - new_null_array( - &DataType::LargeList(Arc::new(Field::new_list_field( - DataType::Int64, - true, - ))), - 1, - ) - } else { - new_null_array( - &DataType::List(Arc::new(Field::new_list_field( - DataType::Int64, - true, - ))), - 1, - ) - }; + let arr = Arc::new(GenericListArray::::from_iter_primitive::< + Int64Type, + _, + _, + >(vec![v])) as ArrayRef; if O::IS_LARGE { ScalarValue::LargeList(arr.as_list::().to_owned().into()) @@ -5229,6 +6038,29 @@ mod tests { .collect() } + fn build_list_view( + values: Vec>>>, + ) -> Vec { + values + .into_iter() + .map(|v| { + let arr = Arc::new(GenericListViewArray::::from_iter_primitive::< + Int64Type, + _, + _, + >(vec![v])) as ArrayRef; + + if O::IS_LARGE { + ScalarValue::LargeListView( + arr.as_list_view::().to_owned().into(), + ) + } else { + ScalarValue::ListView(arr.as_list_view::().to_owned().into()) + } + }) + .collect() + } + #[test] fn test_iter_to_array_fixed_size_list() { let field = Arc::new(Field::new_list_field(DataType::Int32, true)); @@ -5357,13 +6189,13 @@ mod tests { #[test] fn iter_to_array_primitive_test() { + // List // List[[1,2,3]], List[null], List[[4,5]] let scalars = build_list::(vec![ Some(vec![Some(1), Some(2), Some(3)]), None, Some(vec![Some(4), Some(5)]), ]); - let array = ScalarValue::iter_to_array(scalars).unwrap(); let list_array = as_list_array(&array).unwrap(); // List[[1,2,3], null, [4,5]] @@ -5374,20 +6206,57 @@ mod tests { ]); assert_eq!(list_array, &expected); + // LargeList + // List[[1,2,3]], List[null], List[[4,5]] let scalars = build_list::(vec![ Some(vec![Some(1), Some(2), Some(3)]), None, Some(vec![Some(4), Some(5)]), ]); - let array = ScalarValue::iter_to_array(scalars).unwrap(); - let list_array = as_large_list_array(&array).unwrap(); + let large_list_array = as_large_list_array(&array).unwrap(); let expected = LargeListArray::from_iter_primitive::(vec![ 
Some(vec![Some(1), Some(2), Some(3)]), None, Some(vec![Some(4), Some(5)]), ]); - assert_eq!(list_array, &expected); + assert_eq!(large_list_array, &expected); + + // ListView + // ListView[[1,2,3]], ListView[null], ListView[[4,5]] + let scalars = build_list_view::(vec![ + Some(vec![Some(1), Some(2), Some(3)]), + None, + Some(vec![Some(4), Some(5)]), + ]); + + let array = ScalarValue::iter_to_array(scalars).unwrap(); + let list_view_array = as_list_view_array(&array).unwrap(); + // ListView[[1,2,3], null, [4,5]] + let expected = ListViewArray::from_iter_primitive::(vec![ + Some(vec![Some(1), Some(2), Some(3)]), + None, + Some(vec![Some(4), Some(5)]), + ]); + assert_eq!(list_view_array, &expected); + + // LargeListView + // LargeListView[[1,2,3]], LargeListView[null], LargeListView[[4,5]] + let scalars = build_list_view::(vec![ + Some(vec![Some(1), Some(2), Some(3)]), + None, + Some(vec![Some(4), Some(5)]), + ]); + + let array = ScalarValue::iter_to_array(scalars).unwrap(); + let large_list_view_array = as_large_list_view_array(&array).unwrap(); + // LargeListView[[1,2,3], null, [4,5]] + let expected = LargeListViewArray::from_iter_primitive::(vec![ + Some(vec![Some(1), Some(2), Some(3)]), + None, + Some(vec![Some(4), Some(5)]), + ]); + assert_eq!(large_list_view_array, &expected); } #[test] @@ -5430,16 +6299,36 @@ mod tests { ])); let fsl_array: ArrayRef = - Arc::new(ListArray::from_iter_primitive::(vec![ + Arc::new(FixedSizeListArray::from_iter_primitive::( + vec![ + Some(vec![Some(0), Some(1), Some(2)]), + None, + Some(vec![Some(3), None, Some(5)]), + ], + 3, + )); + let list_view_array: ArrayRef = + Arc::new(ListViewArray::from_iter_primitive::(vec![ Some(vec![Some(0), Some(1), Some(2)]), None, - Some(vec![Some(3), None, Some(5)]), + Some(vec![None, Some(5)]), ])); - for arr in [list_array, fsl_array] { + for arr in [list_array, fsl_array, list_view_array] { for i in 0..arr.len() { - let scalar = - ScalarValue::List(arr.slice(i, 1).as_list::().to_owned().into()); + let slice = arr.slice(i, 1); + let scalar = match arr.data_type() { + DataType::List(_) => { + ScalarValue::List(slice.as_list::().to_owned().into()) + } + DataType::FixedSizeList(_, _) => ScalarValue::FixedSizeList( + slice.as_fixed_size_list().to_owned().into(), + ), + DataType::ListView(_) => ScalarValue::ListView( + slice.as_list_view::().to_owned().into(), + ), + _ => unreachable!(), + }; assert!(scalar.eq_array(&arr, i).unwrap()); } } @@ -5471,6 +6360,68 @@ mod tests { Ok(()) } + #[test] + fn scalar_add_trait_null_test() -> Result<()> { + let int_value = ScalarValue::Int32(Some(42)); + + assert_eq!( + int_value.add(ScalarValue::Int32(None))?, + ScalarValue::Int32(None) + ); + + Ok(()) + } + + #[test] + fn scalar_add_trait_wrapping_overflow_test() -> Result<()> { + let int_value = ScalarValue::Int32(Some(i32::MAX)); + let one = ScalarValue::Int32(Some(1)); + + assert_eq!(int_value.add(one)?, ScalarValue::Int32(Some(i32::MIN))); + + Ok(()) + } + + #[test] + fn scalar_add_trait_decimal_scale_test() -> Result<()> { + let decimal = ScalarValue::Decimal128(Some(123), 10, 2); + let decimal_2 = ScalarValue::Decimal128(Some(4), 9, 1); + + assert_eq!( + decimal.add(decimal_2)?, + ScalarValue::Decimal128(Some(163), 11, 2) + ); + + Ok(()) + } + + #[test] + fn scalar_add_trait_decimal256_scale_test() -> Result<()> { + let decimal = ScalarValue::Decimal256(Some(i256::from(123)), 10, 2); + let decimal_2 = ScalarValue::Decimal256(Some(i256::from(4)), 9, 1); + + assert_eq!( + decimal.add(decimal_2)?, + 
ScalarValue::Decimal256(Some(i256::from(163)), 11, 2) + ); + + Ok(()) + } + + #[test] + fn scalar_add_trait_decimal_negative_scale_test() -> Result<()> { + let decimal = ScalarValue::Decimal128(Some(1), DECIMAL128_MAX_PRECISION, i8::MIN); + let decimal_2 = + ScalarValue::Decimal128(Some(1), DECIMAL128_MAX_PRECISION, i8::MIN); + + assert_eq!( + decimal.add(decimal_2)?, + ScalarValue::Decimal128(Some(2), DECIMAL128_MAX_PRECISION, i8::MIN) + ); + + Ok(()) + } + #[test] fn scalar_sub_trait_test() -> Result<()> { let float_value = ScalarValue::Float64(Some(123.)); @@ -5526,7 +6477,10 @@ mod tests { .sub_checked(&int_value_2) .unwrap_err() .strip_backtrace(); - assert_eq!(err, "Arrow error: Arithmetic overflow: Overflow happened on: 9223372036854775807 - -9223372036854775808") + assert_eq!( + err, + "Arrow error: Arithmetic overflow: Overflow happened on: 9223372036854775807 - -9223372036854775808" + ) } #[test] @@ -5567,6 +6521,43 @@ mod tests { Ok(()) } + #[test] + fn scalar_decimal_add_overflow_test() { + check_scalar_decimal_add_overflow::( + ScalarValue::Decimal128(Some(i128::MAX), DECIMAL128_MAX_PRECISION, 0), + ScalarValue::Decimal128(Some(1), DECIMAL128_MAX_PRECISION, 0), + ); + check_scalar_decimal_add_overflow::( + ScalarValue::Decimal256(Some(i256::MAX), DECIMAL256_MAX_PRECISION, 0), + ScalarValue::Decimal256(Some(i256::ONE), DECIMAL256_MAX_PRECISION, 0), + ); + } + + #[test] + fn scalar_decimal_in_place_add_error_preserves_lhs() { + let mut lhs = + ScalarValue::Decimal128(Some(i128::MAX), DECIMAL128_MAX_PRECISION, 0); + let original = lhs.clone(); + + let err = lhs + .try_add_checked_in_place(&ScalarValue::Decimal128( + Some(1), + DECIMAL128_MAX_PRECISION, + 0, + )) + .unwrap_err() + .strip_backtrace(); + + assert_eq!( + err, + format!( + "Arrow error: Arithmetic overflow: Overflow happened on: {} + 1", + i128::MAX + ) + ); + assert_eq!(lhs, original); + } + // Verifies that ScalarValue has the same behavior with compute kernel when it overflows. fn check_scalar_add_overflow(left: ScalarValue, right: ScalarValue) where @@ -5583,6 +6574,22 @@ mod tests { assert_eq!(scalar_result.is_ok(), arrow_result.is_ok()); } + // Verifies the decimal fast path preserves the same overflow behavior as Arrow kernels. 
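For contrast with the checked tests above, a sketch of the wrapping vs. checked contract (values hypothetical, mirroring the overflow tests in this file):

    let max = ScalarValue::Int32(Some(i32::MAX));
    let one = ScalarValue::Int32(Some(1));
    // `add` follows Arrow's wrapping kernel...
    assert_eq!(max.add(&one).unwrap(), ScalarValue::Int32(Some(i32::MIN)));
    // ...while `add_checked` surfaces the overflow as an error.
    assert!(max.add_checked(&one).is_err());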
+ fn check_scalar_decimal_add_overflow(left: ScalarValue, right: ScalarValue) + where + T: ArrowPrimitiveType, + { + let scalar_result = left.add(&right); + + let left_array = left.to_array().expect("Failed to convert to array"); + let right_array = right.to_array().expect("Failed to convert to array"); + let arrow_left_array = left_array.as_primitive::(); + let arrow_right_array = right_array.as_primitive::(); + let arrow_result = add_wrapping(arrow_left_array, arrow_right_array); + + assert_eq!(scalar_result.is_ok(), arrow_result.is_ok()); + } + #[test] fn test_interval_add_timestamp() -> Result<()> { let interval = ScalarValue::IntervalMonthDayNano(Some(IntervalMonthDayNano { @@ -5674,12 +6681,16 @@ mod tests { assert_eq!(123i128, array_decimal.value(0)); assert_eq!(123i128, array_decimal.value(9)); // test eq array - assert!(decimal_value - .eq_array(&array, 1) - .expect("Failed to compare arrays")); - assert!(decimal_value - .eq_array(&array, 5) - .expect("Failed to compare arrays")); + assert!( + decimal_value + .eq_array(&array, 1) + .expect("Failed to compare arrays") + ); + assert!( + decimal_value + .eq_array(&array, 5) + .expect("Failed to compare arrays") + ); // test try from array assert_eq!( decimal_value, @@ -5724,18 +6735,24 @@ mod tests { assert_eq!(4, array.len()); assert_eq!(DataType::Decimal128(10, 2), array.data_type().clone()); - assert!(ScalarValue::try_new_decimal128(1, 10, 2) - .unwrap() - .eq_array(&array, 0) - .expect("Failed to compare arrays")); - assert!(ScalarValue::try_new_decimal128(2, 10, 2) - .unwrap() - .eq_array(&array, 1) - .expect("Failed to compare arrays")); - assert!(ScalarValue::try_new_decimal128(3, 10, 2) - .unwrap() - .eq_array(&array, 2) - .expect("Failed to compare arrays")); + assert!( + ScalarValue::try_new_decimal128(1, 10, 2) + .unwrap() + .eq_array(&array, 0) + .expect("Failed to compare arrays") + ); + assert!( + ScalarValue::try_new_decimal128(2, 10, 2) + .unwrap() + .eq_array(&array, 1) + .expect("Failed to compare arrays") + ); + assert!( + ScalarValue::try_new_decimal128(3, 10, 2) + .unwrap() + .eq_array(&array, 2) + .expect("Failed to compare arrays") + ); assert_eq!( ScalarValue::Decimal128(None, 10, 2), ScalarValue::try_from_array(&array, 3).unwrap() @@ -6010,6 +7027,40 @@ mod tests { ), )); assert_eq!(a.partial_cmp(&b), Some(Ordering::Greater)); + + let a = ScalarValue::ListView(Arc::new(ListViewArray::from_iter_primitive::< + Int64Type, + _, + _, + >(vec![Some(vec![ + None, + Some(2), + Some(3), + ])]))); + let b = ScalarValue::ListView(Arc::new(ListViewArray::from_iter_primitive::< + Int64Type, + _, + _, + >(vec![Some(vec![ + Some(1), + Some(2), + Some(3), + ])]))); + assert_eq!(a.partial_cmp(&b), Some(Ordering::Greater)); + + let a = + ScalarValue::LargeListView(Arc::new( + LargeListViewArray::from_iter_primitive::(vec![Some( + vec![None, Some(2), Some(3)], + )]), + )); + let b = + ScalarValue::LargeListView(Arc::new( + LargeListViewArray::from_iter_primitive::(vec![Some( + vec![Some(1), Some(2), Some(3)], + )]), + )); + assert_eq!(a.partial_cmp(&b), Some(Ordering::Greater)); } #[test] @@ -6171,8 +7222,6 @@ mod tests { } #[test] - // despite clippy claiming they are useless, the code doesn't compile otherwise. 
- #[allow(clippy::useless_vec)] fn scalar_iter_to_array_boolean() { check_scalar_iter!(Boolean, BooleanArray, vec![Some(true), None, Some(false)]); check_scalar_iter!(Float32, Float32Array, vec![Some(1.9), None, Some(-2.1)]); @@ -6222,12 +7271,12 @@ mod tests { check_scalar_iter_binary!( Binary, BinaryArray, - vec![Some(b"foo"), None, Some(b"bar")] + [Some(b"foo"), None, Some(b"bar")] ); check_scalar_iter_binary!( LargeBinary, LargeBinaryArray, - vec![Some(b"foo"), None, Some(b"bar")] + [Some(b"foo"), None, Some(b"bar")] ); } @@ -6359,6 +7408,30 @@ mod tests { ); assert_eq!(expected, scalar); assert!(expected.is_null()); + + // Test for ListView + let data_type = &DataType::ListView(Arc::clone(&inner_field)); + let scalar: ScalarValue = data_type.try_into().unwrap(); + let expected = ScalarValue::ListView( + new_null_array(data_type, 1) + .as_list_view::() + .to_owned() + .into(), + ); + assert_eq!(expected, scalar); + assert!(expected.is_null()); + + // Test for LargeListView + let data_type = &DataType::LargeListView(Arc::clone(&inner_field)); + let scalar: ScalarValue = data_type.try_into().unwrap(); + let expected = ScalarValue::LargeListView( + new_null_array(data_type, 1) + .as_list_view::() + .to_owned() + .into(), + ); + assert_eq!(expected, scalar); + assert!(expected.is_null()); } #[test] @@ -6448,6 +7521,8 @@ mod tests { size_of::>() + (9 * size_of::()) + sv_size, ); + #[allow(clippy::allow_attributes, clippy::mutable_key_type)] + // ScalarValue has interior mutability but is intentionally used as hash key let mut s = HashSet::with_capacity(0); // do NOT clone `sv` here because this may shrink the vector capacity s.insert(v.pop().unwrap()); @@ -6680,7 +7755,9 @@ mod tests { for other_index in 0..array.len() { if index != other_index { assert!( - !scalar.eq_array(&array, other_index).expect("Failed to compare arrays"), + !scalar + .eq_array(&array, other_index) + .expect("Failed to compare arrays"), "Expected {scalar:?} to be NOT equal to {array:?} at index {other_index}" ); } @@ -7069,6 +8146,34 @@ mod tests { builder.append(true); Arc::new(builder.finish()) }, + // list view array + { + let values_builder = StringBuilder::new(); + let mut builder = ListViewBuilder::new(values_builder); + // [A, B] + builder.values().append_value("A"); + builder.values().append_value("B"); + builder.append(true); + // [ ] (empty list) + builder.append(true); + // Null + builder.append(false); + Arc::new(builder.finish()) + }, + // large list view array + { + let values_builder = StringBuilder::new(); + let mut builder = LargeListViewBuilder::new(values_builder); + // [A, B] + builder.values().append_value("A"); + builder.values().append_value("B"); + builder.append(true); + // [ ] (empty list) + builder.append(true); + // Null + builder.append(false); + Arc::new(builder.finish()) + }, // map { let string_builder = StringBuilder::new(); @@ -7108,6 +8213,31 @@ mod tests { } } + #[test] + fn roundtrip_run_array() { + // Comparison logic in round_trip_through_scalar doesn't work for RunArrays + // so we have a custom test for them + // TODO: https://github.com/apache/arrow-rs/pull/9213 might fix this ^ + let run_ends = Int16Array::from(vec![2, 3]); + let values = Int64Array::from(vec![Some(1), None]); + let run_array = RunArray::try_new(&run_ends, &values).unwrap(); + let run_array = run_array.downcast::().unwrap(); + + let expected_values = run_array.into_iter().collect::>(); + + for i in 0..run_array.len() { + let scalar = ScalarValue::try_from_array(&run_array, i).unwrap(); + let array = 
scalar.to_array_of_size(1).unwrap(); + assert_eq!(array.data_type(), run_array.data_type()); + let array = array.as_run::(); + let array = array.downcast::().unwrap(); + assert_eq!( + array.into_iter().collect::>(), + expected_values[i..i + 1] + ); + } + } + #[test] fn test_scalar_union_sparse() { let field_a = Arc::new(Field::new("A", DataType::Int32, true)); @@ -7566,6 +8696,38 @@ mod tests { }, DataType::LargeList(Arc::new(Field::new("element", DataType::Int64, true))), ); + check_scalar_cast( + { + let element_field = + Arc::new(Field::new("element", DataType::Int32, true)); + + let mut builder = + ListViewBuilder::new(Int32Builder::new()).with_field(element_field); + builder.append_value([Some(1)]); + builder.append(true); + + ScalarValue::ListView(Arc::new(builder.finish())) + }, + DataType::ListView(Arc::new(Field::new("element", DataType::Int64, true))), + ); + check_scalar_cast( + { + let element_field = + Arc::new(Field::new("element", DataType::Int32, true)); + + let mut builder = LargeListViewBuilder::new(Int32Builder::new()) + .with_field(element_field); + builder.append_value([Some(1)]); + builder.append(true); + + ScalarValue::LargeListView(Arc::new(builder.finish())) + }, + DataType::LargeListView(Arc::new(Field::new( + "element", + DataType::Int64, + true, + ))), + ); } // mimics how casting work on scalar values by `casting` `scalar` to `desired_type` @@ -7605,7 +8767,6 @@ mod tests { } #[test] - #[allow(arithmetic_overflow)] // we want to test them fn test_scalar_negative_overflows() -> Result<()> { macro_rules! test_overflow_on_value { ($($val:expr),* $(,)?) => {$( @@ -7922,6 +9083,42 @@ mod tests { ScalarValue::Decimal256(Some(10.into()), 1, 0), 5, ), + // Temporal types + ( + ScalarValue::Date32(Some(0)), + ScalarValue::Date32(Some(10)), + 10, + ), + ( + ScalarValue::Date32(Some(10)), + ScalarValue::Date32(Some(0)), + 10, + ), + ( + ScalarValue::Date64(Some(1000)), + ScalarValue::Date64(Some(5000)), + 4000, + ), + ( + ScalarValue::TimestampSecond(Some(100), None), + ScalarValue::TimestampSecond(Some(200), None), + 100, + ), + ( + ScalarValue::TimestampMillisecond(Some(1000), None), + ScalarValue::TimestampMillisecond(Some(5000), None), + 4000, + ), + ( + ScalarValue::TimestampMicrosecond(Some(0), None), + ScalarValue::TimestampMicrosecond(Some(1_000_000), None), + 1_000_000, + ), + ( + ScalarValue::TimestampNanosecond(Some(1_000_000_000), None), + ScalarValue::TimestampNanosecond(Some(2_000_000_000), None), + 1_000_000_000, + ), ]; for (lhs, rhs, expected) in cases.iter() { let distance = lhs.distance(rhs).unwrap(); @@ -7984,8 +9181,6 @@ mod tests { ScalarValue::Boolean(Some(true)), ScalarValue::Boolean(Some(false)), ), - (ScalarValue::Date32(Some(0)), ScalarValue::Date32(Some(1))), - (ScalarValue::Date64(Some(0)), ScalarValue::Date64(Some(1))), ( ScalarValue::Decimal128(Some(123), 5, 5), ScalarValue::Decimal128(Some(120), 5, 3), @@ -8192,7 +9387,7 @@ mod tests { )))), ]; - let check_array = |array| { + let check_array = |array: Arc| { let is_null = is_null(&array).unwrap(); assert_eq!(is_null, BooleanArray::from(vec![true, false, false])); @@ -8257,6 +9452,21 @@ mod tests { "); } + #[test] + fn test_list_view_display() { + let s = ScalarValue::ListView( + ListViewArray::from_iter_primitive::(vec![Some(vec![ + Some(1), + None, + Some(3), + ])]) + .into(), + ); + + assert_eq!(s.to_string(), "[1, , 3]"); + assert_eq!(format!("{s:?}"), "ListView([1, , 3])"); + } + #[test] fn test_null_bug() { let field_a = Field::new("a", DataType::Int32, true); @@ -8621,6 +9831,19 @@ 
mod tests { assert!(dense_scalar.is_null()); } + #[test] + fn cast_date_to_timestamp_overflow_returns_error() { + let scalar = ScalarValue::Date32(Some(i32::MAX)); + let err = scalar + .cast_to(&DataType::Timestamp(TimeUnit::Nanosecond, None)) + .expect_err("expected cast to fail"); + assert!( + err.to_string() + .contains("converted value exceeds the representable i64 range"), + "unexpected error: {err}" + ); + } + #[test] fn null_dictionary_scalar_produces_null_dictionary_array() { let dictionary_scalar = ScalarValue::Dictionary( @@ -8702,13 +9925,17 @@ mod tests { 42, )) .unwrap(), + ScalarValue::try_new_null(&DataType::ListView(Arc::clone(&field_ref))) + .unwrap(), + ScalarValue::try_new_null(&DataType::LargeListView(Arc::clone(&field_ref))) + .unwrap(), ScalarValue::try_new_null(&DataType::Struct( vec![Arc::clone(&field_ref)].into(), )) .unwrap(), ScalarValue::try_new_null(&DataType::Map(map_field_ref, false)).unwrap(), ScalarValue::try_new_null(&DataType::Union( - UnionFields::new(vec![42], vec![field_ref]), + UnionFields::try_new(vec![42], vec![field_ref]).unwrap(), UnionMode::Dense, )) .unwrap(), @@ -8794,6 +10021,41 @@ mod tests { _ => panic!("Expected List"), } + let list_field = Field::new_list_field(DataType::Int32, true); + let list_result = + ScalarValue::new_default(&DataType::LargeList(Arc::new(list_field.clone()))) + .unwrap(); + match list_result { + ScalarValue::LargeList(arr) => { + assert_eq!(arr.len(), 1); + assert_eq!(arr.value_length(0), 0); // empty list + } + _ => panic!("Expected LargeList"), + } + + let list_result = + ScalarValue::new_default(&DataType::ListView(Arc::new(list_field.clone()))) + .unwrap(); + match list_result { + ScalarValue::ListView(arr) => { + assert_eq!(arr.len(), 1); + assert_eq!(arr.value_size(0), 0); // empty list + } + _ => panic!("Expected ListView"), + } + + let list_result = ScalarValue::new_default(&DataType::LargeListView(Arc::new( + list_field.clone(), + ))) + .unwrap(); + match list_result { + ScalarValue::LargeListView(arr) => { + assert_eq!(arr.len(), 1); + assert_eq!(arr.value_size(0), 0); // empty list + } + _ => panic!("Expected LargeListView"), + } + // Test struct type let struct_fields = Fields::from(vec![ Field::new("a", DataType::Int32, false), @@ -8811,13 +10073,14 @@ mod tests { } // Test union type - let union_fields = UnionFields::new( + let union_fields = UnionFields::try_new( vec![0, 1], vec![ Field::new("i32", DataType::Int32, false), Field::new("f64", DataType::Float64, false), ], - ); + ) + .unwrap(); let union_result = ScalarValue::new_default(&DataType::Union( union_fields.clone(), UnionMode::Sparse, @@ -8902,6 +10165,30 @@ mod tests { )))), None ); + assert_eq!( + ScalarValue::min(&DataType::LargeList(Arc::new(Field::new( + "item", + DataType::Int32, + true + )))), + None + ); + assert_eq!( + ScalarValue::min(&DataType::ListView(Arc::new(Field::new( + "item", + DataType::Int32, + true + )))), + None + ); + assert_eq!( + ScalarValue::min(&DataType::LargeListView(Arc::new(Field::new( + "item", + DataType::Int32, + true + )))), + None + ); } #[test] @@ -8978,6 +10265,22 @@ mod tests { )]))), None ); + assert_eq!( + ScalarValue::max(&DataType::ListView(Arc::new(Field::new( + "item", + DataType::Int32, + true + )))), + None + ); + assert_eq!( + ScalarValue::max(&DataType::LargeListView(Arc::new(Field::new( + "item", + DataType::Int32, + true + )))), + None + ); } #[test] @@ -9046,6 +10349,196 @@ mod tests { } } + #[test] + fn test_views_minimize_memory() { + let value = "this string is longer than 12 
bytes".to_string(); + + let scalar = ScalarValue::Utf8View(Some(value.clone())); + let array = scalar.to_array_of_size(10).unwrap(); + let array = array.as_string_view(); + let buffers = array.data_buffers(); + assert_eq!(1, buffers.len()); + // Ensure we only have a single copy of the value string + assert_eq!(value.len(), buffers[0].len()); + + // Same but for BinaryView + let scalar = ScalarValue::BinaryView(Some(value.bytes().collect())); + let array = scalar.to_array_of_size(10).unwrap(); + let array = array.as_binary_view(); + let buffers = array.data_buffers(); + assert_eq!(1, buffers.len()); + assert_eq!(value.len(), buffers[0].len()); + } + + #[test] + fn test_to_array_of_size_run_end_encoded() { + fn run_test() { + let value = Box::new(ScalarValue::Float32(Some(1.0))); + let size = 5; + let scalar = ScalarValue::RunEndEncoded( + Field::new("run_ends", R::DATA_TYPE, false).into(), + Field::new("values", DataType::Float32, true).into(), + value.clone(), + ); + let array = scalar.to_array_of_size(size).unwrap(); + let array = array.as_run::(); + let array = array.downcast::().unwrap(); + assert_eq!(vec![Some(1.0); size], array.into_iter().collect::>()); + assert_eq!(1, array.values().len()); + } + + run_test::(); + run_test::(); + run_test::(); + + let scalar = ScalarValue::RunEndEncoded( + Field::new("run_ends", DataType::Int16, false).into(), + Field::new("values", DataType::Float32, true).into(), + Box::new(ScalarValue::Float32(Some(1.0))), + ); + let err = scalar.to_array_of_size(i16::MAX as usize + 10).unwrap_err(); + assert_eq!( + "Execution error: Cannot construct RunArray of size 32777: Overflows run-ends type Int16", + err.to_string() + ) + } + + #[test] + fn test_eq_array_run_end_encoded() { + let run_ends = Int16Array::from(vec![1, 3]); + let values = Float32Array::from(vec![None, Some(1.0)]); + let run_array = + Arc::new(RunArray::try_new(&run_ends, &values).unwrap()) as ArrayRef; + + let scalar = ScalarValue::RunEndEncoded( + Field::new("run_ends", DataType::Int16, false).into(), + Field::new("values", DataType::Float32, true).into(), + Box::new(ScalarValue::Float32(None)), + ); + assert!(scalar.eq_array(&run_array, 0).unwrap()); + + let scalar = ScalarValue::RunEndEncoded( + Field::new("run_ends", DataType::Int16, false).into(), + Field::new("values", DataType::Float32, true).into(), + Box::new(ScalarValue::Float32(Some(1.0))), + ); + assert!(scalar.eq_array(&run_array, 1).unwrap()); + assert!(scalar.eq_array(&run_array, 2).unwrap()); + + // value types must match + let scalar = ScalarValue::RunEndEncoded( + Field::new("run_ends", DataType::Int16, false).into(), + Field::new("values", DataType::Float64, true).into(), + Box::new(ScalarValue::Float64(Some(1.0))), + ); + let err = scalar.eq_array(&run_array, 1).unwrap_err(); + let expected = "Internal error: could not cast array of type Float32 to arrow_array::array::primitive_array::PrimitiveArray"; + assert!(err.to_string().starts_with(expected)); + + // run ends type must match + let scalar = ScalarValue::RunEndEncoded( + Field::new("run_ends", DataType::Int32, false).into(), + Field::new("values", DataType::Float32, true).into(), + Box::new(ScalarValue::Float32(None)), + ); + let err = scalar.eq_array(&run_array, 0).unwrap_err(); + let expected = "Internal error: could not cast array of type RunEndEncoded(\"run_ends\": non-null Int16, \"values\": Float32) to arrow_array::array::run_array::RunArray"; + assert!(err.to_string().starts_with(expected)); + } + + #[test] + fn test_iter_to_array_run_end_encoded() { + let 
run_ends_field = Arc::new(Field::new("run_ends", DataType::Int16, false)); + let values_field = Arc::new(Field::new("values", DataType::Int64, true)); + let scalars = vec![ + ScalarValue::RunEndEncoded( + Arc::clone(&run_ends_field), + Arc::clone(&values_field), + Box::new(ScalarValue::Int64(Some(1))), + ), + ScalarValue::RunEndEncoded( + Arc::clone(&run_ends_field), + Arc::clone(&values_field), + Box::new(ScalarValue::Int64(Some(1))), + ), + ScalarValue::RunEndEncoded( + Arc::clone(&run_ends_field), + Arc::clone(&values_field), + Box::new(ScalarValue::Int64(None)), + ), + ScalarValue::RunEndEncoded( + Arc::clone(&run_ends_field), + Arc::clone(&values_field), + Box::new(ScalarValue::Int64(Some(2))), + ), + ScalarValue::RunEndEncoded( + Arc::clone(&run_ends_field), + Arc::clone(&values_field), + Box::new(ScalarValue::Int64(Some(2))), + ), + ScalarValue::RunEndEncoded( + Arc::clone(&run_ends_field), + Arc::clone(&values_field), + Box::new(ScalarValue::Int64(Some(2))), + ), + ]; + + let run_array = ScalarValue::iter_to_array(scalars).unwrap(); + let expected = RunArray::try_new( + &Int16Array::from(vec![2, 3, 6]), + &Int64Array::from(vec![Some(1), None, Some(2)]), + ) + .unwrap(); + assert_eq!(&expected as &dyn Array, run_array.as_ref()); + + // inconsistent run-ends type + let scalars = vec![ + ScalarValue::RunEndEncoded( + Arc::clone(&run_ends_field), + Arc::clone(&values_field), + Box::new(ScalarValue::Int64(Some(1))), + ), + ScalarValue::RunEndEncoded( + Field::new("run_ends", DataType::Int32, false).into(), + Arc::clone(&values_field), + Box::new(ScalarValue::Int64(Some(1))), + ), + ]; + let err = ScalarValue::iter_to_array(scalars).unwrap_err(); + let expected = "Execution error: Expected RunEndEncoded scalar with run-ends field Field { \"run_ends\": Int16 } but got: RunEndEncoded(Field { name: \"run_ends\", data_type: Int32 }, Field { name: \"values\", data_type: Int64, nullable: true }, Int64(1))"; + assert!(err.to_string().starts_with(expected)); + + // inconsistent value type + let scalars = vec![ + ScalarValue::RunEndEncoded( + Arc::clone(&run_ends_field), + Arc::clone(&values_field), + Box::new(ScalarValue::Int64(Some(1))), + ), + ScalarValue::RunEndEncoded( + Arc::clone(&run_ends_field), + Field::new("values", DataType::Int32, true).into(), + Box::new(ScalarValue::Int32(Some(1))), + ), + ]; + let err = ScalarValue::iter_to_array(scalars).unwrap_err(); + let expected = "Execution error: Expected RunEndEncoded scalar with run-ends field Field { \"run_ends\": Int16 } but got: RunEndEncoded(Field { name: \"run_ends\", data_type: Int16 }, Field { name: \"values\", data_type: Int32, nullable: true }, Int32(1))"; + assert!(err.to_string().starts_with(expected)); + + // inconsistent scalars type + let scalars = vec![ + ScalarValue::RunEndEncoded( + Arc::clone(&run_ends_field), + Arc::clone(&values_field), + Box::new(ScalarValue::Int64(Some(1))), + ), + ScalarValue::Int64(Some(1)), + ]; + let err = ScalarValue::iter_to_array(scalars).unwrap_err(); + let expected = "Execution error: Expected RunEndEncoded scalar with run-ends field Field { \"run_ends\": Int16 } but got: Int64(1)"; + assert!(err.to_string().starts_with(expected)); + } + #[test] fn test_convert_array_to_scalar_vec() { // 1: Regular ListArray @@ -9166,5 +10659,52 @@ mod tests { ]), ] ); + + // 6: Regular ListViewArray + let list = ListViewArray::from_iter_primitive::(vec![ + Some(vec![Some(1), Some(2)]), + None, + Some(vec![Some(3), None, Some(4)]), + ]); + let converted = 
ScalarValue::convert_array_to_scalar_vec(&list).unwrap(); + assert_eq!( + converted, + vec![ + Some(vec![ + ScalarValue::Int64(Some(1)), + ScalarValue::Int64(Some(2)) + ]), + None, + Some(vec![ + ScalarValue::Int64(Some(3)), + ScalarValue::Int64(None), + ScalarValue::Int64(Some(4)) + ]), + ] + ); + + // 7: Regular LargeListViewArray + let large_list = + LargeListViewArray::from_iter_primitive::<Int64Type, _, _>(vec![ + Some(vec![Some(1), Some(2)]), + None, + Some(vec![Some(3), None, Some(4)]), + ]); + let converted = ScalarValue::convert_array_to_scalar_vec(&large_list).unwrap(); + assert_eq!( + converted, + vec![ + Some(vec![ + ScalarValue::Int64(Some(1)), + ScalarValue::Int64(Some(2)) + ]), + None, + Some(vec![ + ScalarValue::Int64(Some(3)), + ScalarValue::Int64(None), + ScalarValue::Int64(Some(4)) + ]), + ] + ); } } diff --git a/datafusion/common/src/scalar/struct_builder.rs b/datafusion/common/src/scalar/struct_builder.rs index 56daee904514a..045b5778243df 100644 --- a/datafusion/common/src/scalar/struct_builder.rs +++ b/datafusion/common/src/scalar/struct_builder.rs @@ -83,6 +83,7 @@ impl ScalarStructBuilder { } /// Add the specified field and `ScalarValue` to the struct. + #[expect(clippy::needless_pass_by_value)] // kept pass-by-value for public API compatibility pub fn with_scalar(self, field: impl IntoFieldRef, value: ScalarValue) -> Self { // valid scalar value should not fail let array = value.to_array().unwrap(); diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs index da298c20ebcb4..29b9c36c0a7ea 100644 --- a/datafusion/common/src/stats.rs +++ b/datafusion/common/src/stats.rs @@ -22,17 +22,40 @@ use std::fmt::{self, Debug, Display}; use crate::{Result, ScalarValue}; use crate::error::_plan_err; +use crate::utils::aggregate::precision_add; use arrow::datatypes::{DataType, Schema}; /// Represents a value with a degree of certainty. `Precision` is used to /// propagate information about the precision of statistical values. #[derive(Clone, PartialEq, Eq, Default, Copy)] pub enum Precision<T: Debug + Clone + PartialEq + Eq + PartialOrd> { - /// The exact value is known + /// The exact value is known. Used for guaranteeing correctness. + /// + /// Comes from definitive sources such as: + /// - Parquet file metadata (row counts, byte sizes) + /// - In-memory RecordBatch data (actual row counts, byte sizes, null counts) + /// - and more... Exact(T), - /// The value is not known exactly, but is likely close to this value + /// The value is not known exactly, but is likely close to this value. + /// Used for cost-based optimizations. + /// + /// Some operations that would result in `Inexact(T)` would be: + /// - Applying a filter (selectivity is unknown) + /// - Mixing exact and inexact values in arithmetic + /// - and more... Inexact(T), - /// Nothing is known about the value + /// Nothing is known about the value. This is the default state. + /// + /// Acts as an absorbing element in arithmetic -> any operation + /// involving `Absent` yields `Absent`. [`Precision::to_inexact`] + /// on `Absent` returns `Absent`, not `Inexact`; it represents + /// a fundamentally different state.
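+ /// + /// An illustrative sketch of the absorbing behavior (an `ignore`d example, + /// not from this patch; assumes `usize` statistics): + /// + /// ```ignore + /// let absent: Precision<usize> = Precision::Absent; + /// assert_eq!(Precision::Exact(10).add(&absent), Precision::Absent); + /// assert_eq!(absent.to_inexact(), Precision::Absent); // still `Absent` + /// ```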
+ /// + /// Common sources include: + /// - Data sources without statistics + /// - Parquet columns missing from file metadata + /// - Statistics that cannot be derived for an operation (e.g., + /// `distinct_count` after a union, `total_byte_size` for joins) #[default] Absent, } @@ -180,24 +203,74 @@ impl Precision<ScalarValue> { + fn sum_data_type(data_type: &DataType) -> DataType { + match data_type { + DataType::Int8 | DataType::Int16 | DataType::Int32 => DataType::Int64, + DataType::UInt8 | DataType::UInt16 | DataType::UInt32 => DataType::UInt64, + _ => data_type.clone(), + } + } + + fn cast_scalar_to_sum_type(value: &ScalarValue) -> Result<ScalarValue> { + let source_type = value.data_type(); + let target_type = Self::sum_data_type(&source_type); + if source_type == target_type { + Ok(value.clone()) + } else { + value.cast_to(&target_type) + } + } + /// Calculates the sum of two (possibly inexact) [`ScalarValue`] values, /// conservatively propagating exactness information. If one of the input /// values is [`Precision::Absent`], the result is `Absent` too. + /// + /// Uses [`ScalarValue::add_checked`] so that integer overflow returns + /// an error (mapped to `Absent`) instead of silently wrapping. + /// + /// For performance-sensitive paths, prefer `precision_add`, which + /// avoids the Arrow array round-trip. pub fn add(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> { match (self, other) { - (Precision::Exact(a), Precision::Exact(b)) => { - a.add(b).map(Precision::Exact).unwrap_or(Precision::Absent) - } + (Precision::Exact(a), Precision::Exact(b)) => a + .add_checked(b) + .map(Precision::Exact) + .unwrap_or(Precision::Absent), (Precision::Inexact(a), Precision::Exact(b)) | (Precision::Exact(a), Precision::Inexact(b)) | (Precision::Inexact(a), Precision::Inexact(b)) => a - .add(b) + .add_checked(b) + .map(Precision::Inexact) + .unwrap_or(Precision::Absent), (_, _) => Precision::Absent, } } + + /// Casts integer values to the wider SQL `SUM` return type. + /// + /// This reduces overflow risk when `sum_value` statistics are merged: + /// `Int8/Int16/Int32 -> Int64` and `UInt8/UInt16/UInt32 -> UInt64`. + pub fn cast_to_sum_type(&self) -> Precision<ScalarValue> { + match (self.is_exact(), self.get_value()) { + (Some(true), Some(value)) => Self::cast_scalar_to_sum_type(value) + .map(Precision::Exact) + .unwrap_or(Precision::Absent), + (Some(false), Some(value)) => Self::cast_scalar_to_sum_type(value) .map(Precision::Inexact) .unwrap_or(Precision::Absent), (_, _) => Precision::Absent, } } + /// SUM-style addition with integer widening to match SQL `SUM` return + /// types for smaller integral inputs. + pub fn add_for_sum(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> { + let mut lhs = self.cast_to_sum_type(); + let rhs = other.cast_to_sum_type(); + precision_add(&mut lhs, &rhs); + lhs + } + /// Calculates the difference of two (possibly inexact) [`ScalarValue`] values, /// conservatively propagating exactness information. If one of the input /// values is [`Precision::Absent`], the result is `Absent` too. @@ -283,9 +356,14 @@ impl From<Option<T>> for Precision<T> { /// and the transformations output are not always predictable. #[derive(Debug, Clone, PartialEq, Eq)] pub struct Statistics { - /// The number of table rows. + /// The number of rows estimated to be scanned. pub num_rows: Precision<usize>, - /// Total bytes of the table rows. + /// The total bytes of the output data. + /// + /// Note that this is not the same as the total bytes that may be scanned, + /// processed, etc. + /// E.g.
we may read 1GB of data from a Parquet file but the Arrow data + /// the node produces may be 2GB; it's this 2GB that is tracked here. pub total_byte_size: Precision, /// Statistics on a column level. /// @@ -294,6 +372,27 @@ pub struct Statistics { pub column_statistics: Vec, } +/// Fallback to use when NDV overlap can not be estimated from column bounds. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum NdvFallback { + /// Use the larger input NDV. This is the conservative default for + /// related fragments such as files from the same table. + #[default] + Max, + /// Sum the input NDVs. This is a conservative upper bound for + /// independent inputs such as `UNION ALL`. + Sum, +} + +impl NdvFallback { + fn merge(self, left: usize, right: usize) -> usize { + match self { + Self::Max => usize::max(left, right), + Self::Sum => left.saturating_add(right), + } + } +} + impl Default for Statistics { /// Returns a new [`Statistics`] instance with all fields set to unknown /// and no columns. @@ -317,6 +416,31 @@ impl Statistics { } } + /// Calculates `total_byte_size` based on the schema and `num_rows`. + /// If any of the columns has non-primitive width, `total_byte_size` is set to inexact. + pub fn calculate_total_byte_size(&mut self, schema: &Schema) { + let mut row_size = Some(0); + for field in schema.fields() { + match field.data_type().primitive_width() { + Some(width) => { + row_size = row_size.map(|s| s + width); + } + None => { + row_size = None; + break; + } + } + } + match row_size { + None => { + self.total_byte_size = self.total_byte_size.to_inexact(); + } + Some(size) => { + self.total_byte_size = self.num_rows.multiply(&Precision::Exact(size)); + } + } + } + /// Returns an unbounded `ColumnStatistics` for each field in the schema. pub fn unknown_column(schema: &Schema) -> Vec { schema @@ -362,12 +486,17 @@ impl Statistics { /// For example, if we had statistics for columns `{"a", "b", "c"}`, /// projecting to `vec![2, 1]` would return statistics for columns `{"c", /// "b"}`. 
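+ /// + /// A sketch of that call (illustrative only, not from this patch; assumes + /// `stats` covers columns `a`, `b`, `c`): + /// + /// ```ignore + /// let projected = stats.project(Some(&vec![2, 1])); + /// // projected.column_statistics is now [stats for c, stats for b] + /// ```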
- pub fn project(mut self, projection: Option<&Vec>) -> Self { - let Some(projection) = projection else { + pub fn project(self, projection: Option<&impl AsRef<[usize]>>) -> Self { + let projection = projection.map(AsRef::as_ref); + self.project_impl(projection) + } + + fn project_impl(mut self, projection: Option<&[usize]>) -> Self { + let Some(projection) = projection.map(AsRef::as_ref) else { return self; }; - #[allow(clippy::large_enum_variant)] + #[expect(clippy::large_enum_variant)] enum Slot { /// The column is taken and put into the specified statistics location Taken(usize), @@ -381,7 +510,7 @@ impl Statistics { .map(Slot::Present) .collect(); - for idx in projection { + for idx in projection.iter() { let next_idx = self.column_statistics.len(); let slot = std::mem::replace( columns.get_mut(*idx).expect("projection out of bounds"), @@ -477,15 +606,42 @@ impl Statistics { self.column_statistics = self .column_statistics .into_iter() - .map(ColumnStatistics::to_inexact) + .map(|cs| { + let mut cs = cs.to_inexact(); + // Scale byte_size by the row ratio + cs.byte_size = match cs.byte_size { + Precision::Exact(n) | Precision::Inexact(n) => { + Precision::Inexact((n as f64 * ratio) as usize) + } + Precision::Absent => Precision::Absent, + }; + // NDV can never exceed the number of rows + if let Some(&rows) = self.num_rows.get_value() { + cs.distinct_count = cs.distinct_count.min(&Precision::Inexact(rows)); + } + cs + }) .collect(); - // Adjust the total_byte_size for the ratio of rows before and after, also marking it as inexact - self.total_byte_size = match &self.total_byte_size { - Precision::Exact(n) | Precision::Inexact(n) => { - let adjusted = (*n as f64 * ratio) as usize; - Precision::Inexact(adjusted) + + // Compute total_byte_size as sum of column byte_size values if all are present, + // otherwise fall back to scaling the original total_byte_size + let sum_scan_bytes: Option = self + .column_statistics + .iter() + .map(|cs| cs.byte_size.get_value().copied()) + .try_fold(0usize, |acc, val| val.map(|v| acc + v)); + + self.total_byte_size = match sum_scan_bytes { + Some(sum) => Precision::Inexact(sum), + None => { + // Fall back to scaling original total_byte_size if not all columns have byte_size + match &self.total_byte_size { + Precision::Exact(n) | Precision::Inexact(n) => { + Precision::Inexact((*n as f64 * ratio) as usize) + } + Precision::Absent => Precision::Absent, + } } - Precision::Absent => Precision::Absent, }; Ok(self) } @@ -495,24 +651,10 @@ impl Statistics { /// The method assumes that all statistics are for the same schema. /// If not, maybe you can call `SchemaMapper::map_column_statistics` to make them consistent. /// - /// Returns an error if the statistics do not match the specified schemas. - pub fn try_merge_iter<'a, I>(items: I, schema: &Schema) -> Result - where - I: IntoIterator, - { - let mut items = items.into_iter(); - - let Some(init) = items.next() else { - return Ok(Statistics::new_unknown(schema)); - }; - items.try_fold(init.clone(), |acc: Statistics, item_stats: &Statistics| { - acc.try_merge(item_stats) - }) - } - - /// Merge this Statistics value with another Statistics value. + /// This method uses [`NdvFallback::Max`] when `distinct_count` overlap + /// can not be estimated from column bounds. /// - /// Returns an error if the statistics do not match (different schemas). + /// Returns an error if the statistics do not match the specified schemas. 
/// /// # Example /// ``` @@ -520,67 +662,110 @@ impl Statistics { /// # use arrow::datatypes::{Field, Schema, DataType}; /// # use datafusion_common::stats::Precision; /// let stats1 = Statistics::default() - /// .with_num_rows(Precision::Exact(1)) - /// .with_total_byte_size(Precision::Exact(2)) + /// .with_num_rows(Precision::Exact(10)) /// .add_column_statistics( /// ColumnStatistics::new_unknown() - /// .with_null_count(Precision::Exact(3)) - /// .with_min_value(Precision::Exact(ScalarValue::from(4))) - /// .with_max_value(Precision::Exact(ScalarValue::from(5))), + /// .with_min_value(Precision::Exact(ScalarValue::from(1))) + /// .with_max_value(Precision::Exact(ScalarValue::from(100))) + /// .with_sum_value(Precision::Exact(ScalarValue::from(500))), /// ); /// /// let stats2 = Statistics::default() - /// .with_num_rows(Precision::Exact(10)) - /// .with_total_byte_size(Precision::Inexact(20)) + /// .with_num_rows(Precision::Exact(20)) /// .add_column_statistics( /// ColumnStatistics::new_unknown() - /// // absent null count - /// .with_min_value(Precision::Exact(ScalarValue::from(40))) - /// .with_max_value(Precision::Exact(ScalarValue::from(50))), + /// .with_min_value(Precision::Exact(ScalarValue::from(5))) + /// .with_max_value(Precision::Exact(ScalarValue::from(200))) + /// .with_sum_value(Precision::Exact(ScalarValue::from(1000))), /// ); /// - /// let merged_stats = stats1.try_merge(&stats2).unwrap(); - /// let expected_stats = Statistics::default() - /// .with_num_rows(Precision::Exact(11)) - /// .with_total_byte_size(Precision::Inexact(22)) // inexact in stats2 --> inexact - /// .add_column_statistics( - /// ColumnStatistics::new_unknown() - /// .with_null_count(Precision::Absent) // missing from stats2 --> absent - /// .with_min_value(Precision::Exact(ScalarValue::from(4))) - /// .with_max_value(Precision::Exact(ScalarValue::from(50))), - /// ); + /// let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); + /// let merged = Statistics::try_merge_iter( + /// &[stats1, stats2], + /// &schema, + /// ).unwrap(); /// - /// assert_eq!(merged_stats, expected_stats) + /// assert_eq!(merged.num_rows, Precision::Exact(30)); + /// assert_eq!(merged.column_statistics[0].min_value, + /// Precision::Exact(ScalarValue::from(1))); + /// assert_eq!(merged.column_statistics[0].max_value, + /// Precision::Exact(ScalarValue::from(200))); + /// assert_eq!(merged.column_statistics[0].sum_value, + /// Precision::Exact(ScalarValue::Int64(Some(1500)))); /// ``` - pub fn try_merge(self, other: &Statistics) -> Result { - let Self { - mut num_rows, - mut total_byte_size, - mut column_statistics, - } = self; - - // Accumulate statistics for subsequent items - num_rows = num_rows.add(&other.num_rows); - total_byte_size = total_byte_size.add(&other.total_byte_size); - - if column_statistics.len() != other.column_statistics.len() { - return _plan_err!( - "Cannot merge statistics with different number of columns: {} vs {}", - column_statistics.len(), - other.column_statistics.len() - ); + pub fn try_merge_iter<'a, I>(items: I, schema: &Schema) -> Result + where + I: IntoIterator, + { + Self::try_merge_iter_with_ndv_fallback(items, schema, NdvFallback::Max) + } + + /// Same as [`Statistics::try_merge_iter`], but lets callers choose the + /// fallback used when `distinct_count` overlap can not be estimated. 
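+ /// + /// For example (a sketch, not from this patch; `left` and `right` stand for + /// statistics of independent `UNION ALL` inputs): + /// + /// ```ignore + /// let merged = Statistics::try_merge_iter_with_ndv_fallback( + /// [&left, &right], + /// &schema, + /// NdvFallback::Sum, // independent inputs: NDVs are additive + /// )?; + /// ```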
+ pub fn try_merge_iter_with_ndv_fallback<'a, I>( + items: I, + schema: &Schema, + ndv_fallback: NdvFallback, + ) -> Result + where + I: IntoIterator, + { + let mut items = items.into_iter(); + let Some(first) = items.next() else { + return Ok(Statistics::new_unknown(schema)); + }; + let Some(second) = items.next() else { + return Ok(first.clone()); + }; + + let num_cols = first.column_statistics.len(); + let mut num_rows = first.num_rows; + let mut total_byte_size = first.total_byte_size; + let mut column_statistics = first.column_statistics.clone(); + for col_stats in &mut column_statistics { + cast_sum_value_to_sum_type_in_place(&mut col_stats.sum_value); } - for (item_col_stats, col_stats) in other - .column_statistics - .iter() - .zip(column_statistics.iter_mut()) - { - col_stats.null_count = col_stats.null_count.add(&item_col_stats.null_count); - col_stats.max_value = col_stats.max_value.max(&item_col_stats.max_value); - col_stats.min_value = col_stats.min_value.min(&item_col_stats.min_value); - col_stats.sum_value = col_stats.sum_value.add(&item_col_stats.sum_value); - col_stats.distinct_count = Precision::Absent; + // Merge the remaining items in a single pass. + for (i, stat) in std::iter::once(second).chain(items).enumerate() { + if stat.column_statistics.len() != num_cols { + return _plan_err!( + "Cannot merge statistics with different number of columns: {} vs {} (item {})", + num_cols, + stat.column_statistics.len(), + i + 1 + ); + } + num_rows = num_rows.add(&stat.num_rows); + total_byte_size = total_byte_size.add(&stat.total_byte_size); + + // Uses precision_add for sum (reuses the lhs accumulator for + // direct numeric addition), while preserving the NDV update + // ordering required by estimate_ndv_with_overlap. + for (col_stats, item_cs) in + column_statistics.iter_mut().zip(&stat.column_statistics) + { + col_stats.null_count = col_stats.null_count.add(&item_cs.null_count); + + // NDV must be computed before min/max update (needs pre-merge ranges) + col_stats.distinct_count = match ( + col_stats.distinct_count.get_value(), + item_cs.distinct_count.get_value(), + ) { + (Some(&l), Some(&r)) => Precision::Inexact( + estimate_ndv_with_overlap(col_stats, item_cs, l, r) + .unwrap_or_else(|| ndv_fallback.merge(l, r)), + ), + _ => Precision::Absent, + }; + precision_min(&mut col_stats.min_value, &item_cs.min_value); + precision_max(&mut col_stats.max_value, &item_cs.max_value); + precision_add_for_sum_in_place( + &mut col_stats.sum_value, + &item_cs.sum_value, + ); + col_stats.byte_size = col_stats.byte_size.add(&item_cs.byte_size); + } } Ok(Statistics { @@ -591,6 +776,205 @@ impl Statistics { } } +/// Estimates the combined number of distinct values (NDV) when merging two +/// column statistics, using range overlap to avoid double-counting shared values. +/// +/// Assumes values are distributed uniformly within each input's +/// `[min, max]` range (the standard assumption when only summary +/// statistics are available). Under uniformity the fraction of an input's +/// distinct values that land in a sub-range equals the fraction of +/// the range that sub-range covers. +/// +/// The combined value space is split into three disjoint regions: +/// +/// ```text +/// |-- only A --|-- overlap --|-- only B --| +/// ``` +/// +/// * **Only in A/B** - values outside the other input's range +/// contribute `(1 - overlap_a) * NDV_a` and `(1 - overlap_b) * NDV_b`. +/// * **Overlap** - both inputs may produce values here. 
We take +/// `max(overlap_a * NDV_a, overlap_b * NDV_b)` rather than the +/// sum because values in the same sub-range are likely shared +/// (the smaller set is assumed to be a subset of the larger). +/// +/// The formula ranges between `[max(NDV_a, NDV_b), NDV_a + NDV_b]`, +/// from full overlap to no overlap. +/// +/// ```text +/// NDV = max(overlap_a * NDV_a, overlap_b * NDV_b) [intersection] +/// + (1 - overlap_a) * NDV_a [only in A] +/// + (1 - overlap_b) * NDV_b [only in B] +/// ``` +/// +/// Returns `None` when min/max are absent or distance is unsupported +/// (e.g. strings), in which case the caller should fall back to a simpler +/// estimate. +pub fn estimate_ndv_with_overlap( + left: &ColumnStatistics, + right: &ColumnStatistics, + ndv_left: usize, + ndv_right: usize, +) -> Option<usize> { + let left_min = left.min_value.get_value()?; + let left_max = left.max_value.get_value()?; + let right_min = right.min_value.get_value()?; + let right_max = right.max_value.get_value()?; + + let range_left = left_max.distance(left_min)?; + let range_right = right_max.distance(right_min)?; + + // Constant columns (range == 0) can't use the proportional overlap + // formula below, so check interval overlap directly instead. + if range_left == 0 || range_right == 0 { + let overlaps = left_min <= right_max && right_min <= left_max; + return Some(if overlaps { + usize::max(ndv_left, ndv_right) + } else { + ndv_left + ndv_right + }); + } + + let overlap_min = if left_min >= right_min { + left_min + } else { + right_min + }; + let overlap_max = if left_max <= right_max { + left_max + } else { + right_max + }; + + // Disjoint ranges: no overlap, NDVs are additive + if overlap_min > overlap_max { + return Some(ndv_left + ndv_right); + } + + let overlap_range = overlap_max.distance(overlap_min)? as f64; + + let overlap_left = overlap_range / range_left as f64; + let overlap_right = overlap_range / range_right as f64; + + let intersection = f64::max( + overlap_left * ndv_left as f64, + overlap_right * ndv_right as f64, + ); + let only_left = (1.0 - overlap_left) * ndv_left as f64; + let only_right = (1.0 - overlap_right) * ndv_right as f64; + + Some((intersection + only_left + only_right).round() as usize) +} + +/// Returns the minimum precision while not allocating a new value; +/// mirrors the semantics of `PartialOrd`. +#[inline] +fn precision_min<T>(lhs: &mut Precision<T>, rhs: &Precision<T>) +where + T: Debug + Clone + PartialEq + Eq + PartialOrd, +{ + *lhs = match (std::mem::take(lhs), rhs) { + (Precision::Exact(left), Precision::Exact(right)) => { + if left <= *right { + Precision::Exact(left) + } else { + Precision::Exact(right.clone()) + } + } + (Precision::Exact(left), Precision::Inexact(right)) + | (Precision::Inexact(left), Precision::Exact(right)) + | (Precision::Inexact(left), Precision::Inexact(right)) => { + if left <= *right { + Precision::Inexact(left) + } else { + Precision::Inexact(right.clone()) + } + } + (_, _) => Precision::Absent, + }; +} + +/// Returns the maximum precision while not allocating a new value; +/// mirrors the semantics of `PartialOrd`.
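+/// +/// An illustrative sketch (not a doctest): +/// +/// ```ignore +/// let mut lhs = Precision::Exact(3); +/// precision_max(&mut lhs, &Precision::Inexact(5)); +/// assert_eq!(lhs, Precision::Inexact(5)); // exactness is demoted +/// ```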
+#[inline] +fn precision_max<T>(lhs: &mut Precision<T>, rhs: &Precision<T>) +where + T: Debug + Clone + PartialEq + Eq + PartialOrd, +{ + *lhs = match (std::mem::take(lhs), rhs) { + (Precision::Exact(left), Precision::Exact(right)) => { + if left >= *right { + Precision::Exact(left) + } else { + Precision::Exact(right.clone()) + } + } + (Precision::Exact(left), Precision::Inexact(right)) + | (Precision::Inexact(left), Precision::Exact(right)) + | (Precision::Inexact(left), Precision::Inexact(right)) => { + if left >= *right { + Precision::Inexact(left) + } else { + Precision::Inexact(right.clone()) + } + } + (_, _) => Precision::Absent, + }; +} + +#[inline] +fn cast_sum_value_to_sum_type_in_place(value: &mut Precision<ScalarValue>) { + let (is_exact, inner) = match std::mem::take(value) { + Precision::Exact(v) => (true, v), + Precision::Inexact(v) => (false, v), + Precision::Absent => return, + }; + let source_type = inner.data_type(); + let target_type = Precision::<ScalarValue>::sum_data_type(&source_type); + + let wrap_precision_fn: fn(ScalarValue) -> Precision<ScalarValue> = if is_exact { + Precision::Exact + } else { + Precision::Inexact + }; + + *value = if source_type == target_type { + wrap_precision_fn(inner) + } else { + inner + .cast_to(&target_type) + .map(wrap_precision_fn) + .unwrap_or(Precision::Absent) + }; +} + +#[inline] +fn precision_add_for_sum_in_place( + lhs: &mut Precision<ScalarValue>, + rhs: &Precision<ScalarValue>, +) { + let (value, wrap_fn): (&ScalarValue, fn(ScalarValue) -> Precision<ScalarValue>) = + match rhs { + Precision::Exact(v) => (v, Precision::Exact), + Precision::Inexact(v) => (v, Precision::Inexact), + Precision::Absent => { + *lhs = Precision::Absent; + return; + } + }; + let source_type = value.data_type(); + let target_type = Precision::<ScalarValue>::sum_data_type(&source_type); + if source_type == target_type { + precision_add(lhs, rhs); + } else { + let rhs = value + .cast_to(&target_type) + .map(wrap_fn) + .unwrap_or(Precision::Absent); + precision_add(lhs, &rhs); + } +} + /// Creates an estimate of the number of rows in the output using the given /// optional value and exactness flag. fn check_num_rows(value: Option<usize>, is_exact: bool) -> Precision<usize> { @@ -642,6 +1026,11 @@ impl Display for Statistics { } else { s }; + let s = if cs.byte_size != Precision::Absent { + format!("{} ScanBytes={}", s, cs.byte_size) + } else { + s + }; s + ")" }) @@ -667,10 +1056,33 @@ pub struct ColumnStatistics { pub max_value: Precision<ScalarValue>, /// Minimum value of column pub min_value: Precision<ScalarValue>, - /// Sum value of a column + /// Sum value of a column. + /// + /// For integral columns, values should be kept in SUM-compatible widened + /// types (`Int8/Int16/Int32 -> Int64`, `UInt8/UInt16/UInt32 -> UInt64`) to + /// reduce overflow risk during statistics propagation. + /// + /// Callers should prefer [`ColumnStatistics::with_sum_value`] for setting + /// this field and [`Precision::add_for_sum`] / + /// [`Precision::cast_to_sum_type`] for sum arithmetic. pub sum_value: Precision<ScalarValue>, /// Number of distinct values pub distinct_count: Precision<usize>, + /// Estimated size of this column's data in bytes for the output. + /// + /// Note that this is not the same as the total bytes that may be scanned, + /// processed, etc. + /// + /// E.g. we may read 1GB of data from a Parquet file but the Arrow data + /// the node produces may be 2GB; it's this 2GB that is tracked here. + /// + /// Currently this is accurately calculated for primitive types only. + /// For complex types (like Utf8, List, Struct, etc), this value may be + /// absent or inexact (e.g.
estimated from the size of the data in the source Parquet files). + /// + /// This value is automatically scaled when operations like limits or + /// filters reduce the number of rows (see [`Statistics::with_fetch`]). pub byte_size: Precision<usize>, } impl ColumnStatistics { @@ -693,6 +1105,7 @@ impl ColumnStatistics { min_value: Precision::Absent, sum_value: Precision::Absent, distinct_count: Precision::Absent, + byte_size: Precision::Absent, } } @@ -716,7 +1129,19 @@ impl ColumnStatistics { /// Set the sum value pub fn with_sum_value(mut self, sum_value: Precision<ScalarValue>) -> Self { - self.sum_value = sum_value; + self.sum_value = match sum_value { + Precision::Exact(value) => { + Precision::<ScalarValue>::cast_scalar_to_sum_type(&value) + .map(Precision::Exact) + .unwrap_or(Precision::Absent) + } + Precision::Inexact(value) => { + Precision::<ScalarValue>::cast_scalar_to_sum_type(&value) + .map(Precision::Inexact) + .unwrap_or(Precision::Absent) + } + Precision::Absent => Precision::Absent, + }; self } @@ -726,6 +1151,13 @@ impl ColumnStatistics { self } + /// Set the scan byte size. + /// This should initially be set to the total size of the column. + pub fn with_byte_size(mut self, byte_size: Precision<usize>) -> Self { + self.byte_size = byte_size; + self + } + /// If the exactness of a [`ColumnStatistics`] instance is lost, this /// function relaxes the exactness of all information by converting them /// to [`Precision::Inexact`]. @@ -735,6 +1167,7 @@ impl ColumnStatistics { self.min_value = self.min_value.to_inexact(); self.sum_value = self.sum_value.to_inexact(); self.distinct_count = self.distinct_count.to_inexact(); + self.byte_size = self.byte_size.to_inexact(); self } } @@ -861,6 +1294,45 @@ mod tests { assert_eq!(precision.add(&Precision::Absent), Precision::Absent); } + #[test] + fn test_add_for_sum_scalar_integer_widening() { + let precision = Precision::Exact(ScalarValue::Int32(Some(42))); + + assert_eq!( + precision.add_for_sum(&Precision::Exact(ScalarValue::Int32(Some(23)))), + Precision::Exact(ScalarValue::Int64(Some(65))), + ); + assert_eq!( + precision.add_for_sum(&Precision::Inexact(ScalarValue::Int32(Some(23)))), + Precision::Inexact(ScalarValue::Int64(Some(65))), + ); + } + + #[test] + fn test_add_for_sum_prevents_int32_overflow() { + let lhs = Precision::Exact(ScalarValue::Int32(Some(i32::MAX))); + let rhs = Precision::Exact(ScalarValue::Int32(Some(1))); + + assert_eq!( + lhs.add_for_sum(&rhs), + Precision::Exact(ScalarValue::Int64(Some(i64::from(i32::MAX) + 1))), + ); + } + + #[test] + fn test_add_for_sum_scalar_unsigned_integer_widening() { + let precision = Precision::Exact(ScalarValue::UInt32(Some(42))); + + assert_eq!( + precision.add_for_sum(&Precision::Exact(ScalarValue::UInt32(Some(23)))), + Precision::Exact(ScalarValue::UInt64(Some(65))), + ); + assert_eq!( + precision.add_for_sum(&Precision::Inexact(ScalarValue::UInt32(Some(23)))), + Precision::Inexact(ScalarValue::UInt64(Some(65))), + ); + } + #[test] fn test_sub() { let precision1 = Precision::Exact(42); @@ -961,9 +1433,11 @@ mod tests { Precision::Exact(ScalarValue::Int64(None)), ); // Overflow returns error - assert!(Precision::Exact(ScalarValue::Int32(Some(256))) - .cast_to(&DataType::Int8) - .is_err()); + assert!( + Precision::Exact(ScalarValue::Int32(Some(256))) + .cast_to(&DataType::Int8) + .is_err() + ); } #[test] @@ -976,15 +1450,13 @@ mod tests { // Precision is not copy (requires .clone()) let precision: Precision<ScalarValue> = Precision::Exact(ScalarValue::Int64(Some(42))); - // Clippy would complain about this if it were Copy -
#[allow(clippy::redundant_clone)] let p2 = precision.clone(); assert_eq!(precision, p2); } #[test] fn test_project_none() { - let projection = None; + let projection: Option> = None; let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref()); assert_eq!(stats, make_stats(vec![10, 20, 30])); } @@ -1026,11 +1498,50 @@ mod tests { min_value: Precision::Exact(ScalarValue::Int64(Some(64))), sum_value: Precision::Exact(ScalarValue::Int64(Some(4600))), distinct_count: Precision::Exact(100), + byte_size: Precision::Exact(800), } } + fn make_single_i64_ndv_stats( + distinct_count: Precision, + min_value: Option, + max_value: Option, + ) -> Statistics { + let to_precision = |value| Precision::Exact(ScalarValue::Int64(Some(value))); + + Statistics::default() + .with_num_rows(Precision::Exact(10)) + .add_column_statistics( + ColumnStatistics::new_unknown() + .with_distinct_count(distinct_count) + .with_min_value( + min_value.map(to_precision).unwrap_or(Precision::Absent), + ) + .with_max_value( + max_value.map(to_precision).unwrap_or(Precision::Absent), + ), + ) + } + + fn merge_single_i64_ndv_distinct_count( + left: Statistics, + right: Statistics, + ndv_fallback: NdvFallback, + ) -> Precision { + let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]); + + Statistics::try_merge_iter_with_ndv_fallback( + [&left, &right], + &schema, + ndv_fallback, + ) + .unwrap() + .column_statistics[0] + .distinct_count + } + #[test] - fn test_try_merge_basic() { + fn test_try_merge() { // Create a schema with two columns let schema = Arc::new(Schema::new(vec![ Field::new("col1", DataType::Int32, false), @@ -1048,6 +1559,7 @@ mod tests { min_value: Precision::Exact(ScalarValue::Int32(Some(1))), sum_value: Precision::Exact(ScalarValue::Int32(Some(500))), distinct_count: Precision::Absent, + byte_size: Precision::Exact(40), }, ColumnStatistics { null_count: Precision::Exact(2), @@ -1055,6 +1567,7 @@ mod tests { min_value: Precision::Exact(ScalarValue::Int32(Some(10))), sum_value: Precision::Exact(ScalarValue::Int32(Some(1000))), distinct_count: Precision::Absent, + byte_size: Precision::Exact(40), }, ], }; @@ -1069,6 +1582,7 @@ mod tests { min_value: Precision::Exact(ScalarValue::Int32(Some(-10))), sum_value: Precision::Exact(ScalarValue::Int32(Some(600))), distinct_count: Precision::Absent, + byte_size: Precision::Exact(60), }, ColumnStatistics { null_count: Precision::Exact(3), @@ -1076,6 +1590,7 @@ mod tests { min_value: Precision::Exact(ScalarValue::Int32(Some(5))), sum_value: Precision::Exact(ScalarValue::Int32(Some(1200))), distinct_count: Precision::Absent, + byte_size: Precision::Exact(60), }, ], }; @@ -1101,7 +1616,7 @@ mod tests { ); assert_eq!( col1_stats.sum_value, - Precision::Exact(ScalarValue::Int32(Some(1100))) + Precision::Exact(ScalarValue::Int64(Some(1100))) ); // 500 + 600 let col2_stats = &summary_stats.column_statistics[1]; @@ -1116,7 +1631,7 @@ mod tests { ); assert_eq!( col2_stats.sum_value, - Precision::Exact(ScalarValue::Int32(Some(2200))) + Precision::Exact(ScalarValue::Int64(Some(2200))) ); // 1000 + 1200 } @@ -1139,6 +1654,7 @@ mod tests { min_value: Precision::Inexact(ScalarValue::Int32(Some(1))), sum_value: Precision::Exact(ScalarValue::Int32(Some(500))), distinct_count: Precision::Absent, + byte_size: Precision::Exact(40), }], }; @@ -1151,6 +1667,7 @@ mod tests { min_value: Precision::Exact(ScalarValue::Int32(Some(-10))), sum_value: Precision::Absent, distinct_count: Precision::Absent, + byte_size: Precision::Inexact(60), }], }; @@ -1171,7 +1688,7 @@ mod tests 
{ col_stats.min_value, Precision::Inexact(ScalarValue::Int32(Some(-10))) ); - assert!(matches!(col_stats.sum_value, Precision::Absent)); + assert_eq!(col_stats.sum_value, Precision::Absent); } #[test] @@ -1215,7 +1732,10 @@ mod tests { let items = vec![stats1, stats2]; let e = Statistics::try_merge_iter(&items, &schema).unwrap_err(); - assert_contains!(e.to_string(), "Error during planning: Cannot merge statistics with different number of columns: 0 vs 1"); + assert_contains!( + e.to_string(), + "Error during planning: Cannot merge statistics with different number of columns: 0 vs 1" + ); } #[test] @@ -1244,7 +1764,9 @@ mod tests { ); // Merge statistics - let merged_stats = stats1.try_merge(&stats2).unwrap(); + let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); + let merged_stats = + Statistics::try_merge_iter([&stats1, &stats2], &schema).unwrap(); // Verify the results assert_eq!(merged_stats.num_rows, Precision::Exact(25)); @@ -1260,66 +1782,486 @@ mod tests { col_stats.max_value, Precision::Exact(ScalarValue::Int32(Some(20))) ); - // Distinct count should be Absent after merge - assert_eq!(col_stats.distinct_count, Precision::Absent); + // Overlap-based NDV: ranges [1,10] and [5,20], overlap [5,10] + // range_left=9, range_right=15, overlap=5 + // overlap_left=5*(5/9)=2.78, overlap_right=7*(5/15)=2.33 + // result = max(2.78, 2.33) + (5-2.78) + (7-2.33) = 9.67 -> 10 + assert_eq!(col_stats.distinct_count, Precision::Inexact(10)); } #[test] - fn test_with_fetch_basic_preservation() { - // Test that column statistics and byte size are preserved (as inexact) when applying fetch - let original_stats = Statistics { - num_rows: Precision::Exact(1000), - total_byte_size: Precision::Exact(8000), - column_statistics: vec![ - ColumnStatistics { - null_count: Precision::Exact(10), - max_value: Precision::Exact(ScalarValue::Int32(Some(100))), - min_value: Precision::Exact(ScalarValue::Int32(Some(0))), - sum_value: Precision::Exact(ScalarValue::Int32(Some(5050))), - distinct_count: Precision::Exact(50), - }, - ColumnStatistics { - null_count: Precision::Exact(20), - max_value: Precision::Exact(ScalarValue::Int64(Some(200))), - min_value: Precision::Exact(ScalarValue::Int64(Some(10))), - sum_value: Precision::Exact(ScalarValue::Int64(Some(10100))), - distinct_count: Precision::Exact(75), - }, - ], - }; - - // Apply fetch of 100 rows (10% of original) - let result = original_stats.clone().with_fetch(Some(100), 0, 1).unwrap(); - - // Check num_rows - assert_eq!(result.num_rows, Precision::Exact(100)); - - // Check total_byte_size is scaled proportionally and marked as inexact - // 100/1000 = 0.1, so 8000 * 0.1 = 800 - assert_eq!(result.total_byte_size, Precision::Inexact(800)); - - // Check column statistics are preserved but marked as inexact - assert_eq!(result.column_statistics.len(), 2); + fn test_try_merge_ndv_disjoint_ranges() { + let stats1 = Statistics::default() + .with_num_rows(Precision::Exact(10)) + .add_column_statistics( + ColumnStatistics::new_unknown() + .with_min_value(Precision::Exact(ScalarValue::Int32(Some(0)))) + .with_max_value(Precision::Exact(ScalarValue::Int32(Some(10)))) + .with_distinct_count(Precision::Exact(5)), + ); + let stats2 = Statistics::default() + .with_num_rows(Precision::Exact(10)) + .add_column_statistics( + ColumnStatistics::new_unknown() + .with_min_value(Precision::Exact(ScalarValue::Int32(Some(20)))) + .with_max_value(Precision::Exact(ScalarValue::Int32(Some(30)))) + .with_distinct_count(Precision::Exact(8)), + ); - // First column - 
assert_eq!( - result.column_statistics[0].null_count, - Precision::Inexact(10) - ); - assert_eq!( - result.column_statistics[0].max_value, - Precision::Inexact(ScalarValue::Int32(Some(100))) - ); - assert_eq!( - result.column_statistics[0].min_value, - Precision::Inexact(ScalarValue::Int32(Some(0))) - ); + let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); + let merged = Statistics::try_merge_iter([&stats1, &stats2], &schema).unwrap(); + // No overlap -> sum of NDVs assert_eq!( - result.column_statistics[0].sum_value, - Precision::Inexact(ScalarValue::Int32(Some(5050))) + merged.column_statistics[0].distinct_count, + Precision::Inexact(13) ); + } + + #[test] + fn test_try_merge_ndv_identical_ranges() { + let stats1 = Statistics::default() + .with_num_rows(Precision::Exact(100)) + .add_column_statistics( + ColumnStatistics::new_unknown() + .with_min_value(Precision::Exact(ScalarValue::Int32(Some(0)))) + .with_max_value(Precision::Exact(ScalarValue::Int32(Some(100)))) + .with_distinct_count(Precision::Exact(50)), + ); + let stats2 = Statistics::default() + .with_num_rows(Precision::Exact(100)) + .add_column_statistics( + ColumnStatistics::new_unknown() + .with_min_value(Precision::Exact(ScalarValue::Int32(Some(0)))) + .with_max_value(Precision::Exact(ScalarValue::Int32(Some(100)))) + .with_distinct_count(Precision::Exact(30)), + ); + + let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); + let merged = Statistics::try_merge_iter([&stats1, &stats2], &schema).unwrap(); + // Full overlap -> max(50, 30) = 50 assert_eq!( - result.column_statistics[0].distinct_count, + merged.column_statistics[0].distinct_count, + Precision::Inexact(50) + ); + } + + #[test] + fn test_try_merge_ndv_partial_overlap() { + let stats1 = Statistics::default() + .with_num_rows(Precision::Exact(100)) + .add_column_statistics( + ColumnStatistics::new_unknown() + .with_min_value(Precision::Exact(ScalarValue::Int32(Some(0)))) + .with_max_value(Precision::Exact(ScalarValue::Int32(Some(100)))) + .with_distinct_count(Precision::Exact(80)), + ); + let stats2 = Statistics::default() + .with_num_rows(Precision::Exact(100)) + .add_column_statistics( + ColumnStatistics::new_unknown() + .with_min_value(Precision::Exact(ScalarValue::Int32(Some(50)))) + .with_max_value(Precision::Exact(ScalarValue::Int32(Some(150)))) + .with_distinct_count(Precision::Exact(60)), + ); + + let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); + let merged = Statistics::try_merge_iter([&stats1, &stats2], &schema).unwrap(); + // overlap=[50,100], range_left=100, range_right=100, overlap_range=50 + // overlap_left=80*(50/100)=40, overlap_right=60*(50/100)=30 + // result = max(40,30) + (80-40) + (60-30) = 40 + 40 + 30 = 110 + assert_eq!( + merged.column_statistics[0].distinct_count, + Precision::Inexact(110) + ); + } + + #[test] + fn test_try_merge_ndv_missing_min_max() { + let stats1 = Statistics::default() + .with_num_rows(Precision::Exact(10)) + .add_column_statistics( + ColumnStatistics::new_unknown().with_distinct_count(Precision::Exact(5)), + ); + let stats2 = Statistics::default() + .with_num_rows(Precision::Exact(10)) + .add_column_statistics( + ColumnStatistics::new_unknown().with_distinct_count(Precision::Exact(8)), + ); + + let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); + let merged = Statistics::try_merge_iter([&stats1, &stats2], &schema).unwrap(); + // No min/max -> default fallback is max + assert_eq!( + merged.column_statistics[0].distinct_count, + 
Precision::Inexact(8) + ); + } + + #[test] + fn test_try_merge_ndv_non_numeric_types() { + let stats1 = Statistics::default() + .with_num_rows(Precision::Exact(10)) + .add_column_statistics( + ColumnStatistics::new_unknown() + .with_min_value(Precision::Exact(ScalarValue::Utf8(Some( + "aaa".to_string(), + )))) + .with_max_value(Precision::Exact(ScalarValue::Utf8(Some( + "zzz".to_string(), + )))) + .with_distinct_count(Precision::Exact(5)), + ); + let stats2 = Statistics::default() + .with_num_rows(Precision::Exact(10)) + .add_column_statistics( + ColumnStatistics::new_unknown() + .with_min_value(Precision::Exact(ScalarValue::Utf8(Some( + "bbb".to_string(), + )))) + .with_max_value(Precision::Exact(ScalarValue::Utf8(Some( + "yyy".to_string(), + )))) + .with_distinct_count(Precision::Exact(8)), + ); + + let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]); + let merged = Statistics::try_merge_iter([&stats1, &stats2], &schema).unwrap(); + // distance() unsupported for strings -> default fallback is max + assert_eq!( + merged.column_statistics[0].distinct_count, + Precision::Inexact(8) + ); + } + + #[test] + fn test_try_merge_ndv_non_numeric_types_sum_fallback() { + let stats1 = Statistics::default() + .with_num_rows(Precision::Exact(10)) + .add_column_statistics( + ColumnStatistics::new_unknown() + .with_min_value(Precision::Exact(ScalarValue::Utf8(Some( + "aaa".to_string(), + )))) + .with_max_value(Precision::Exact(ScalarValue::Utf8(Some( + "zzz".to_string(), + )))) + .with_distinct_count(Precision::Exact(5)), + ); + let stats2 = Statistics::default() + .with_num_rows(Precision::Exact(10)) + .add_column_statistics( + ColumnStatistics::new_unknown() + .with_min_value(Precision::Exact(ScalarValue::Utf8(Some( + "bbb".to_string(), + )))) + .with_max_value(Precision::Exact(ScalarValue::Utf8(Some( + "yyy".to_string(), + )))) + .with_distinct_count(Precision::Exact(8)), + ); + + let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]); + let merged = Statistics::try_merge_iter_with_ndv_fallback( + [&stats1, &stats2], + &schema, + NdvFallback::Sum, + ) + .unwrap(); + + // distance() unsupported for strings -> sum fallback is caller-selected + assert_eq!( + merged.column_statistics[0].distinct_count, + Precision::Inexact(13) + ); + } + + #[test] + fn test_try_merge_ndv_constant_columns() { + // Same constant: [5,5]+[5,5] -> max + let stats1 = Statistics::default() + .with_num_rows(Precision::Exact(10)) + .add_column_statistics( + ColumnStatistics::new_unknown() + .with_min_value(Precision::Exact(ScalarValue::Int32(Some(5)))) + .with_max_value(Precision::Exact(ScalarValue::Int32(Some(5)))) + .with_distinct_count(Precision::Exact(1)), + ); + let stats2 = Statistics::default() + .with_num_rows(Precision::Exact(10)) + .add_column_statistics( + ColumnStatistics::new_unknown() + .with_min_value(Precision::Exact(ScalarValue::Int32(Some(5)))) + .with_max_value(Precision::Exact(ScalarValue::Int32(Some(5)))) + .with_distinct_count(Precision::Exact(1)), + ); + + let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); + let merged = Statistics::try_merge_iter([&stats1, &stats2], &schema).unwrap(); + assert_eq!( + merged.column_statistics[0].distinct_count, + Precision::Inexact(1) + ); + + // Different constants: [5,5]+[10,10] -> sum + let stats3 = Statistics::default() + .with_num_rows(Precision::Exact(10)) + .add_column_statistics( + ColumnStatistics::new_unknown() + .with_min_value(Precision::Exact(ScalarValue::Int32(Some(5)))) + 
.with_max_value(Precision::Exact(ScalarValue::Int32(Some(5)))) + .with_distinct_count(Precision::Exact(1)), + ); + let stats4 = Statistics::default() + .with_num_rows(Precision::Exact(10)) + .add_column_statistics( + ColumnStatistics::new_unknown() + .with_min_value(Precision::Exact(ScalarValue::Int32(Some(10)))) + .with_max_value(Precision::Exact(ScalarValue::Int32(Some(10)))) + .with_distinct_count(Precision::Exact(1)), + ); + + let merged = Statistics::try_merge_iter([&stats3, &stats4], &schema).unwrap(); + assert_eq!( + merged.column_statistics[0].distinct_count, + Precision::Inexact(2) + ); + } + + #[test] + fn test_try_merge_ndv_original_union_edge_cases() { + struct NdvTestCase { + name: &'static str, + left_ndv: Precision, + left_min: Option, + left_max: Option, + right_ndv: Precision, + right_min: Option, + right_max: Option, + expected: Precision, + } + + let cases = vec![ + NdvTestCase { + name: "disjoint ranges", + left_ndv: Precision::Exact(5), + left_min: Some(0), + left_max: Some(10), + right_ndv: Precision::Exact(3), + right_min: Some(20), + right_max: Some(30), + expected: Precision::Inexact(8), + }, + NdvTestCase { + name: "identical ranges", + left_ndv: Precision::Exact(10), + left_min: Some(0), + left_max: Some(100), + right_ndv: Precision::Exact(8), + right_min: Some(0), + right_max: Some(100), + expected: Precision::Inexact(10), + }, + NdvTestCase { + name: "partial overlap", + left_ndv: Precision::Exact(100), + left_min: Some(0), + left_max: Some(100), + right_ndv: Precision::Exact(50), + right_min: Some(50), + right_max: Some(150), + expected: Precision::Inexact(125), + }, + NdvTestCase { + name: "right contained in left", + left_ndv: Precision::Exact(100), + left_min: Some(0), + left_max: Some(100), + right_ndv: Precision::Exact(50), + right_min: Some(25), + right_max: Some(75), + expected: Precision::Inexact(100), + }, + NdvTestCase { + name: "same constant value", + left_ndv: Precision::Exact(1), + left_min: Some(5), + left_max: Some(5), + right_ndv: Precision::Exact(1), + right_min: Some(5), + right_max: Some(5), + expected: Precision::Inexact(1), + }, + NdvTestCase { + name: "different constant values", + left_ndv: Precision::Exact(1), + left_min: Some(5), + left_max: Some(5), + right_ndv: Precision::Exact(1), + right_min: Some(10), + right_max: Some(10), + expected: Precision::Inexact(2), + }, + NdvTestCase { + name: "left constant within right range", + left_ndv: Precision::Exact(1), + left_min: Some(5), + left_max: Some(5), + right_ndv: Precision::Exact(10), + right_min: Some(0), + right_max: Some(10), + expected: Precision::Inexact(10), + }, + NdvTestCase { + name: "left constant outside right range", + left_ndv: Precision::Exact(1), + left_min: Some(20), + left_max: Some(20), + right_ndv: Precision::Exact(10), + right_min: Some(0), + right_max: Some(10), + expected: Precision::Inexact(11), + }, + NdvTestCase { + name: "right constant within left range", + left_ndv: Precision::Exact(10), + left_min: Some(0), + left_max: Some(10), + right_ndv: Precision::Exact(1), + right_min: Some(5), + right_max: Some(5), + expected: Precision::Inexact(10), + }, + NdvTestCase { + name: "right constant outside left range", + left_ndv: Precision::Exact(10), + left_min: Some(0), + left_max: Some(10), + right_ndv: Precision::Exact(1), + right_min: Some(20), + right_max: Some(20), + expected: Precision::Inexact(11), + }, + NdvTestCase { + name: "missing bounds exact plus exact", + left_ndv: Precision::Exact(10), + left_min: None, + left_max: None, + right_ndv: 
Precision::Exact(5), + right_min: None, + right_max: None, + expected: Precision::Inexact(15), + }, + NdvTestCase { + name: "missing bounds exact plus inexact", + left_ndv: Precision::Exact(10), + left_min: None, + left_max: None, + right_ndv: Precision::Inexact(5), + right_min: None, + right_max: None, + expected: Precision::Inexact(15), + }, + NdvTestCase { + name: "missing bounds inexact plus inexact", + left_ndv: Precision::Inexact(7), + left_min: None, + left_max: None, + right_ndv: Precision::Inexact(3), + right_min: None, + right_max: None, + expected: Precision::Inexact(10), + }, + NdvTestCase { + name: "exact plus absent", + left_ndv: Precision::Exact(10), + left_min: None, + left_max: None, + right_ndv: Precision::Absent, + right_min: None, + right_max: None, + expected: Precision::Absent, + }, + NdvTestCase { + name: "inexact plus absent", + left_ndv: Precision::Inexact(4), + left_min: None, + left_max: None, + right_ndv: Precision::Absent, + right_min: None, + right_max: None, + expected: Precision::Absent, + }, + ]; + + for case in cases { + let actual = merge_single_i64_ndv_distinct_count( + make_single_i64_ndv_stats(case.left_ndv, case.left_min, case.left_max), + make_single_i64_ndv_stats(case.right_ndv, case.right_min, case.right_max), + NdvFallback::Sum, + ); + + assert_eq!(actual, case.expected, "case {} failed", case.name); + } + } + + #[test] + fn test_with_fetch_basic_preservation() { + // Test that column statistics and byte size are preserved (as inexact) when applying fetch + let original_stats = Statistics { + num_rows: Precision::Exact(1000), + total_byte_size: Precision::Exact(8000), + column_statistics: vec![ + ColumnStatistics { + null_count: Precision::Exact(10), + max_value: Precision::Exact(ScalarValue::Int32(Some(100))), + min_value: Precision::Exact(ScalarValue::Int32(Some(0))), + sum_value: Precision::Exact(ScalarValue::Int32(Some(5050))), + distinct_count: Precision::Exact(50), + byte_size: Precision::Exact(4000), + }, + ColumnStatistics { + null_count: Precision::Exact(20), + max_value: Precision::Exact(ScalarValue::Int64(Some(200))), + min_value: Precision::Exact(ScalarValue::Int64(Some(10))), + sum_value: Precision::Exact(ScalarValue::Int64(Some(10100))), + distinct_count: Precision::Exact(75), + byte_size: Precision::Exact(8000), + }, + ], + }; + + // Apply fetch of 100 rows (10% of original) + let result = original_stats.clone().with_fetch(Some(100), 0, 1).unwrap(); + + // Check num_rows + assert_eq!(result.num_rows, Precision::Exact(100)); + + // Check total_byte_size is computed as sum of scaled column byte_size values + // Column 1: 4000 * 0.1 = 400, Column 2: 8000 * 0.1 = 800, Sum = 1200 + assert_eq!(result.total_byte_size, Precision::Inexact(1200)); + + // Check column statistics are preserved but marked as inexact + assert_eq!(result.column_statistics.len(), 2); + + // First column + assert_eq!( + result.column_statistics[0].null_count, + Precision::Inexact(10) + ); + assert_eq!( + result.column_statistics[0].max_value, + Precision::Inexact(ScalarValue::Int32(Some(100))) + ); + assert_eq!( + result.column_statistics[0].min_value, + Precision::Inexact(ScalarValue::Int32(Some(0))) + ); + assert_eq!( + result.column_statistics[0].sum_value, + Precision::Inexact(ScalarValue::Int32(Some(5050))) + ); + assert_eq!( + result.column_statistics[0].distinct_count, Precision::Inexact(50) ); @@ -1358,6 +2300,7 @@ mod tests { min_value: Precision::Inexact(ScalarValue::Int32(Some(0))), sum_value: Precision::Inexact(ScalarValue::Int32(Some(5050))), 
distinct_count: Precision::Inexact(50), + byte_size: Precision::Inexact(4000), }], }; @@ -1366,9 +2309,9 @@ mod tests { // Check num_rows is inexact assert_eq!(result.num_rows, Precision::Inexact(500)); - // Check total_byte_size is scaled and inexact - // 500/1000 = 0.5, so 8000 * 0.5 = 4000 - assert_eq!(result.total_byte_size, Precision::Inexact(4000)); + // Check total_byte_size is computed as sum of scaled column byte_size values + // Column 1: 4000 * 0.5 = 2000, Sum = 2000 + assert_eq!(result.total_byte_size, Precision::Inexact(2000)); // Column stats remain inexact assert_eq!( @@ -1425,8 +2368,8 @@ mod tests { .unwrap(); assert_eq!(result.num_rows, Precision::Exact(300)); - // 300/1000 = 0.3, so 8000 * 0.3 = 2400 - assert_eq!(result.total_byte_size, Precision::Inexact(2400)); + // Column 1: byte_size 800 * (300/500) = 240, Sum = 240 + assert_eq!(result.total_byte_size, Precision::Inexact(240)); } #[test] @@ -1442,8 +2385,8 @@ mod tests { let result = original_stats.clone().with_fetch(Some(100), 0, 4).unwrap(); assert_eq!(result.num_rows, Precision::Exact(400)); - // 400/1000 = 0.4, so 8000 * 0.4 = 3200 - assert_eq!(result.total_byte_size, Precision::Inexact(3200)); + // Column 1: byte_size 800 * 0.4 = 320, Sum = 320 + assert_eq!(result.total_byte_size, Precision::Inexact(320)); } #[test] @@ -1458,6 +2401,7 @@ mod tests { min_value: Precision::Absent, sum_value: Precision::Absent, distinct_count: Precision::Absent, + byte_size: Precision::Absent, }], }; @@ -1496,6 +2440,7 @@ mod tests { min_value: Precision::Exact(ScalarValue::Int32(Some(-100))), sum_value: Precision::Exact(ScalarValue::Int32(Some(123456))), distinct_count: Precision::Exact(789), + byte_size: Precision::Exact(4000), }; let original_stats = Statistics { @@ -1522,6 +2467,780 @@ mod tests { result_col_stats.sum_value, Precision::Inexact(ScalarValue::Int32(Some(123456))) ); - assert_eq!(result_col_stats.distinct_count, Precision::Inexact(789)); + // NDV is capped at the new row count (250) since 789 > 250 + assert_eq!(result_col_stats.distinct_count, Precision::Inexact(250)); + } + + #[test] + fn test_byte_size_to_inexact() { + let col_stats = ColumnStatistics { + null_count: Precision::Exact(10), + max_value: Precision::Absent, + min_value: Precision::Absent, + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + byte_size: Precision::Exact(5000), + }; + + let inexact = col_stats.to_inexact(); + assert_eq!(inexact.byte_size, Precision::Inexact(5000)); + } + + #[test] + fn test_with_byte_size_builder() { + let col_stats = + ColumnStatistics::new_unknown().with_byte_size(Precision::Exact(8192)); + assert_eq!(col_stats.byte_size, Precision::Exact(8192)); + } + + #[test] + fn test_with_sum_value_builder_widens_small_integers() { + let col_stats = ColumnStatistics::new_unknown() + .with_sum_value(Precision::Exact(ScalarValue::UInt32(Some(123)))); + assert_eq!( + col_stats.sum_value, + Precision::Exact(ScalarValue::UInt64(Some(123))) + ); + } + + #[test] + fn test_with_fetch_scales_byte_size() { + // Test that byte_size is scaled by the row ratio in with_fetch + let original_stats = Statistics { + num_rows: Precision::Exact(1000), + total_byte_size: Precision::Exact(8000), + column_statistics: vec![ + ColumnStatistics { + null_count: Precision::Exact(10), + max_value: Precision::Absent, + min_value: Precision::Absent, + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + byte_size: Precision::Exact(4000), + }, + ColumnStatistics { + null_count: Precision::Exact(20), + max_value: 
Precision::Absent, + min_value: Precision::Absent, + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + byte_size: Precision::Exact(8000), + }, + ], + }; + + // Apply fetch of 100 rows (10% of original) + let result = original_stats.with_fetch(Some(100), 0, 1).unwrap(); + + // byte_size should be scaled: 4000 * 0.1 = 400, 8000 * 0.1 = 800 + assert_eq!( + result.column_statistics[0].byte_size, + Precision::Inexact(400) + ); + assert_eq!( + result.column_statistics[1].byte_size, + Precision::Inexact(800) + ); + + // total_byte_size should be computed as sum of byte_size values: 400 + 800 = 1200 + assert_eq!(result.total_byte_size, Precision::Inexact(1200)); + } + + #[test] + fn test_with_fetch_total_byte_size_fallback() { + // Test that total_byte_size falls back to scaling when not all columns have byte_size + let original_stats = Statistics { + num_rows: Precision::Exact(1000), + total_byte_size: Precision::Exact(8000), + column_statistics: vec![ + ColumnStatistics { + null_count: Precision::Exact(10), + max_value: Precision::Absent, + min_value: Precision::Absent, + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + byte_size: Precision::Exact(4000), + }, + ColumnStatistics { + null_count: Precision::Exact(20), + max_value: Precision::Absent, + min_value: Precision::Absent, + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + byte_size: Precision::Absent, // One column has no byte_size + }, + ], + }; + + // Apply fetch of 100 rows (10% of original) + let result = original_stats.with_fetch(Some(100), 0, 1).unwrap(); + + // total_byte_size should fall back to scaling: 8000 * 0.1 = 800 + assert_eq!(result.total_byte_size, Precision::Inexact(800)); + } + + #[test] + fn test_with_fetch_caps_ndv_at_row_count() { + // NDV=500 but after LIMIT 10, NDV should be capped at 10 + let stats = Statistics { + num_rows: Precision::Exact(1000), + total_byte_size: Precision::Exact(8000), + column_statistics: vec![ColumnStatistics { + distinct_count: Precision::Inexact(500), + ..Default::default() + }], + }; + + let result = stats.with_fetch(Some(10), 0, 1).unwrap(); + assert_eq!(result.num_rows, Precision::Exact(10)); + assert_eq!( + result.column_statistics[0].distinct_count, + Precision::Inexact(10) + ); + } + + #[test] + fn test_with_fetch_caps_ndv_with_skip() { + // 1000 rows, NDV=500, OFFSET 5 LIMIT 10 + // with_fetch computes num_rows = min(1000 - 5, 10) = 10 + // NDV should be capped at 10 + let stats = Statistics { + num_rows: Precision::Exact(1000), + total_byte_size: Precision::Exact(8000), + column_statistics: vec![ColumnStatistics { + distinct_count: Precision::Inexact(500), + ..Default::default() + }], + }; + + let result = stats.with_fetch(Some(10), 5, 1).unwrap(); + assert_eq!(result.num_rows, Precision::Exact(10)); + assert_eq!( + result.column_statistics[0].distinct_count, + Precision::Inexact(10) + ); + } + + #[test] + fn test_with_fetch_caps_ndv_with_large_skip() { + // 1000 rows, NDV=500, OFFSET 995 LIMIT 100 + // with_fetch computes num_rows = min(1000 - 995, 100) = 5 + // NDV should be capped at 5 + let stats = Statistics { + num_rows: Precision::Exact(1000), + total_byte_size: Precision::Exact(8000), + column_statistics: vec![ColumnStatistics { + distinct_count: Precision::Inexact(500), + ..Default::default() + }], + }; + + let result = stats.with_fetch(Some(100), 995, 1).unwrap(); + assert_eq!(result.num_rows, Precision::Exact(5)); + assert_eq!( + result.column_statistics[0].distinct_count, + Precision::Inexact(5) + ); + } + + 
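+ // Illustrative sketch only (not part of this patch): exercises the new + // `calculate_total_byte_size` helper on an all-primitive schema, assuming + // the behavior documented on the method. + #[test] + fn test_calculate_total_byte_size_sketch() { + let schema = Schema::new(vec![ + Field::new("a", DataType::Int64, false), // 8 bytes per row + Field::new("b", DataType::Float32, false), // 4 bytes per row + ]); + let mut stats = Statistics::default().with_num_rows(Precision::Exact(10)); + stats.calculate_total_byte_size(&schema); + // 10 rows * (8 + 4) bytes per row = 120 bytes, exact since num_rows is exact + assert_eq!(stats.total_byte_size, Precision::Exact(120)); + } +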
#[test] + fn test_with_fetch_ndv_below_row_count_unchanged() { + // NDV=5 and LIMIT 10: NDV should stay at 5 + let stats = Statistics { + num_rows: Precision::Exact(1000), + total_byte_size: Precision::Exact(8000), + column_statistics: vec![ColumnStatistics { + distinct_count: Precision::Inexact(5), + ..Default::default() + }], + }; + + let result = stats.with_fetch(Some(10), 0, 1).unwrap(); + assert_eq!(result.num_rows, Precision::Exact(10)); + assert_eq!( + result.column_statistics[0].distinct_count, + Precision::Inexact(5) + ); + } + + #[test] + fn test_try_merge_iter_basic() { + let schema = Arc::new(Schema::new(vec![ + Field::new("col1", DataType::Int32, false), + Field::new("col2", DataType::Int32, false), + ])); + + let stats1 = Statistics { + num_rows: Precision::Exact(10), + total_byte_size: Precision::Exact(100), + column_statistics: vec![ + ColumnStatistics { + null_count: Precision::Exact(1), + max_value: Precision::Exact(ScalarValue::Int32(Some(100))), + min_value: Precision::Exact(ScalarValue::Int32(Some(1))), + sum_value: Precision::Exact(ScalarValue::Int32(Some(500))), + distinct_count: Precision::Absent, + byte_size: Precision::Exact(40), + }, + ColumnStatistics { + null_count: Precision::Exact(2), + max_value: Precision::Exact(ScalarValue::Int32(Some(200))), + min_value: Precision::Exact(ScalarValue::Int32(Some(10))), + sum_value: Precision::Exact(ScalarValue::Int32(Some(1000))), + distinct_count: Precision::Absent, + byte_size: Precision::Exact(40), + }, + ], + }; + + let stats2 = Statistics { + num_rows: Precision::Exact(15), + total_byte_size: Precision::Exact(150), + column_statistics: vec![ + ColumnStatistics { + null_count: Precision::Exact(2), + max_value: Precision::Exact(ScalarValue::Int32(Some(120))), + min_value: Precision::Exact(ScalarValue::Int32(Some(-10))), + sum_value: Precision::Exact(ScalarValue::Int32(Some(600))), + distinct_count: Precision::Absent, + byte_size: Precision::Exact(60), + }, + ColumnStatistics { + null_count: Precision::Exact(3), + max_value: Precision::Exact(ScalarValue::Int32(Some(180))), + min_value: Precision::Exact(ScalarValue::Int32(Some(5))), + sum_value: Precision::Exact(ScalarValue::Int32(Some(1200))), + distinct_count: Precision::Absent, + byte_size: Precision::Exact(60), + }, + ], + }; + + let items = vec![&stats1, &stats2]; + let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap(); + + assert_eq!(summary_stats.num_rows, Precision::Exact(25)); + assert_eq!(summary_stats.total_byte_size, Precision::Exact(250)); + + let col1_stats = &summary_stats.column_statistics[0]; + assert_eq!(col1_stats.null_count, Precision::Exact(3)); + assert_eq!( + col1_stats.max_value, + Precision::Exact(ScalarValue::Int32(Some(120))) + ); + assert_eq!( + col1_stats.min_value, + Precision::Exact(ScalarValue::Int32(Some(-10))) + ); + assert_eq!( + col1_stats.sum_value, + Precision::Exact(ScalarValue::Int64(Some(1100))) + ); + + let col2_stats = &summary_stats.column_statistics[1]; + assert_eq!(col2_stats.null_count, Precision::Exact(5)); + assert_eq!( + col2_stats.max_value, + Precision::Exact(ScalarValue::Int32(Some(200))) + ); + assert_eq!( + col2_stats.min_value, + Precision::Exact(ScalarValue::Int32(Some(5))) + ); + assert_eq!( + col2_stats.sum_value, + Precision::Exact(ScalarValue::Int64(Some(2200))) + ); + } + + #[test] + fn test_try_merge_iter_mixed_precision() { + let schema = Arc::new(Schema::new(vec![Field::new( + "col1", + DataType::Int32, + false, + )])); + + let stats1 = Statistics { + num_rows: Precision::Exact(10), + 
total_byte_size: Precision::Inexact(100), + column_statistics: vec![ColumnStatistics { + null_count: Precision::Exact(1), + max_value: Precision::Exact(ScalarValue::Int32(Some(100))), + min_value: Precision::Inexact(ScalarValue::Int32(Some(1))), + sum_value: Precision::Exact(ScalarValue::Int32(Some(500))), + distinct_count: Precision::Absent, + byte_size: Precision::Exact(40), + }], + }; + + let stats2 = Statistics { + num_rows: Precision::Inexact(15), + total_byte_size: Precision::Exact(150), + column_statistics: vec![ColumnStatistics { + null_count: Precision::Inexact(2), + max_value: Precision::Inexact(ScalarValue::Int32(Some(120))), + min_value: Precision::Exact(ScalarValue::Int32(Some(-10))), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + byte_size: Precision::Inexact(60), + }], + }; + + let items = vec![&stats1, &stats2]; + let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap(); + + assert_eq!(summary_stats.num_rows, Precision::Inexact(25)); + assert_eq!(summary_stats.total_byte_size, Precision::Inexact(250)); + + let col_stats = &summary_stats.column_statistics[0]; + assert_eq!(col_stats.null_count, Precision::Inexact(3)); + assert_eq!( + col_stats.max_value, + Precision::Inexact(ScalarValue::Int32(Some(120))) + ); + assert_eq!( + col_stats.min_value, + Precision::Inexact(ScalarValue::Int32(Some(-10))) + ); + // sum_value becomes Absent because stats2 has Absent sum + assert_eq!(col_stats.sum_value, Precision::Absent); + } + + #[test] + fn test_try_merge_iter_empty() { + let schema = Arc::new(Schema::new(vec![Field::new( + "col1", + DataType::Int32, + false, + )])); + + let items: Vec<&Statistics> = vec![]; + let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap(); + + assert_eq!(summary_stats.num_rows, Precision::Absent); + assert_eq!(summary_stats.total_byte_size, Precision::Absent); + assert_eq!(summary_stats.column_statistics.len(), 1); + assert_eq!( + summary_stats.column_statistics[0].null_count, + Precision::Absent + ); + } + + #[test] + fn test_try_merge_iter_single_item() { + let schema = Arc::new(Schema::new(vec![Field::new( + "col1", + DataType::Int32, + false, + )])); + + let stats = Statistics { + num_rows: Precision::Exact(10), + total_byte_size: Precision::Exact(100), + column_statistics: vec![ColumnStatistics { + null_count: Precision::Exact(1), + max_value: Precision::Exact(ScalarValue::Int32(Some(100))), + min_value: Precision::Exact(ScalarValue::Int32(Some(1))), + sum_value: Precision::Exact(ScalarValue::Int32(Some(500))), + distinct_count: Precision::Exact(10), + byte_size: Precision::Exact(40), + }], + }; + + let items = vec![&stats]; + let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap(); + + assert_eq!(summary_stats, stats); + } + + #[test] + fn test_try_merge_iter_mismatched_columns() { + let schema = Arc::new(Schema::new(vec![Field::new( + "col1", + DataType::Int32, + false, + )])); + + let stats1 = Statistics::default(); + let stats2 = + Statistics::default().add_column_statistics(ColumnStatistics::new_unknown()); + + let items = vec![&stats1, &stats2]; + let e = Statistics::try_merge_iter(items, &schema).unwrap_err(); + assert_contains!( + e.to_string(), + "Cannot merge statistics with different number of columns: 0 vs 1" + ); + } + + #[test] + fn test_try_merge_iter_three_items() { + // Verify that merging three items works correctly + let schema = Arc::new(Schema::new(vec![Field::new( + "col1", + DataType::Int64, + false, + )])); + + let stats1 = Statistics { + num_rows: 
Precision::Exact(10), + total_byte_size: Precision::Exact(100), + column_statistics: vec![ColumnStatistics { + null_count: Precision::Exact(1), + max_value: Precision::Exact(ScalarValue::Int64(Some(100))), + min_value: Precision::Exact(ScalarValue::Int64(Some(10))), + sum_value: Precision::Exact(ScalarValue::Int64(Some(500))), + distinct_count: Precision::Exact(8), + byte_size: Precision::Exact(80), + }], + }; + + let stats2 = Statistics { + num_rows: Precision::Exact(20), + total_byte_size: Precision::Exact(200), + column_statistics: vec![ColumnStatistics { + null_count: Precision::Exact(2), + max_value: Precision::Exact(ScalarValue::Int64(Some(200))), + min_value: Precision::Exact(ScalarValue::Int64(Some(5))), + sum_value: Precision::Exact(ScalarValue::Int64(Some(1000))), + distinct_count: Precision::Exact(15), + byte_size: Precision::Exact(160), + }], + }; + + let stats3 = Statistics { + num_rows: Precision::Exact(30), + total_byte_size: Precision::Exact(300), + column_statistics: vec![ColumnStatistics { + null_count: Precision::Exact(3), + max_value: Precision::Exact(ScalarValue::Int64(Some(150))), + min_value: Precision::Exact(ScalarValue::Int64(Some(1))), + sum_value: Precision::Exact(ScalarValue::Int64(Some(2000))), + distinct_count: Precision::Exact(25), + byte_size: Precision::Exact(240), + }], + }; + + let items = vec![&stats1, &stats2, &stats3]; + let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap(); + + assert_eq!(summary_stats.num_rows, Precision::Exact(60)); + assert_eq!(summary_stats.total_byte_size, Precision::Exact(600)); + + let col_stats = &summary_stats.column_statistics[0]; + assert_eq!(col_stats.null_count, Precision::Exact(6)); + assert_eq!( + col_stats.max_value, + Precision::Exact(ScalarValue::Int64(Some(200))) + ); + assert_eq!( + col_stats.min_value, + Precision::Exact(ScalarValue::Int64(Some(1))) + ); + assert_eq!( + col_stats.sum_value, + Precision::Exact(ScalarValue::Int64(Some(3500))) + ); + assert_eq!(col_stats.byte_size, Precision::Exact(480)); + // Overlap-based NDV merge (pairwise left-to-right): + // stats1+stats2: [10,100]+[5,200] -> NDV=16, then +stats3: [5,200]+[1,150] -> NDV=29 + assert_eq!(col_stats.distinct_count, Precision::Inexact(29)); + } + + #[test] + fn test_try_merge_iter_float_types() { + let schema = Arc::new(Schema::new(vec![Field::new( + "col1", + DataType::Float64, + false, + )])); + + let stats1 = Statistics { + num_rows: Precision::Exact(10), + total_byte_size: Precision::Exact(80), + column_statistics: vec![ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Float64(Some(99.9))), + min_value: Precision::Exact(ScalarValue::Float64(Some(1.1))), + sum_value: Precision::Exact(ScalarValue::Float64(Some(500.5))), + distinct_count: Precision::Absent, + byte_size: Precision::Exact(80), + }], + }; + + let stats2 = Statistics { + num_rows: Precision::Exact(10), + total_byte_size: Precision::Exact(80), + column_statistics: vec![ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Float64(Some(200.0))), + min_value: Precision::Exact(ScalarValue::Float64(Some(0.5))), + sum_value: Precision::Exact(ScalarValue::Float64(Some(1000.0))), + distinct_count: Precision::Absent, + byte_size: Precision::Exact(80), + }], + }; + + let items = vec![&stats1, &stats2]; + let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap(); + + let col_stats = &summary_stats.column_statistics[0]; + assert_eq!( + col_stats.max_value, + 
Precision::Exact(ScalarValue::Float64(Some(200.0))) + ); + assert_eq!( + col_stats.min_value, + Precision::Exact(ScalarValue::Float64(Some(0.5))) + ); + assert_eq!( + col_stats.sum_value, + Precision::Exact(ScalarValue::Float64(Some(1500.5))) + ); + } + + #[test] + fn test_try_merge_iter_string_types() { + let schema = + Arc::new(Schema::new(vec![Field::new("col1", DataType::Utf8, false)])); + + let stats1 = Statistics { + num_rows: Precision::Exact(10), + total_byte_size: Precision::Exact(100), + column_statistics: vec![ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Utf8(Some("dog".to_string()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("ant".to_string()))), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + byte_size: Precision::Exact(100), + }], + }; + + let stats2 = Statistics { + num_rows: Precision::Exact(10), + total_byte_size: Precision::Exact(100), + column_statistics: vec![ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Utf8(Some("zebra".to_string()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("bat".to_string()))), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + byte_size: Precision::Exact(100), + }], + }; + + let items = vec![&stats1, &stats2]; + let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap(); + + let col_stats = &summary_stats.column_statistics[0]; + assert_eq!( + col_stats.max_value, + Precision::Exact(ScalarValue::Utf8(Some("zebra".to_string()))) + ); + assert_eq!( + col_stats.min_value, + Precision::Exact(ScalarValue::Utf8(Some("ant".to_string()))) + ); + assert_eq!(col_stats.sum_value, Precision::Absent); + } + + #[test] + fn test_try_merge_iter_all_inexact() { + let schema = Arc::new(Schema::new(vec![Field::new( + "col1", + DataType::Int32, + false, + )])); + + let stats1 = Statistics { + num_rows: Precision::Inexact(10), + total_byte_size: Precision::Inexact(100), + column_statistics: vec![ColumnStatistics { + null_count: Precision::Inexact(1), + max_value: Precision::Inexact(ScalarValue::Int32(Some(100))), + min_value: Precision::Inexact(ScalarValue::Int32(Some(1))), + sum_value: Precision::Inexact(ScalarValue::Int32(Some(500))), + distinct_count: Precision::Absent, + byte_size: Precision::Inexact(40), + }], + }; + + let stats2 = Statistics { + num_rows: Precision::Inexact(20), + total_byte_size: Precision::Inexact(200), + column_statistics: vec![ColumnStatistics { + null_count: Precision::Inexact(2), + max_value: Precision::Inexact(ScalarValue::Int32(Some(200))), + min_value: Precision::Inexact(ScalarValue::Int32(Some(-5))), + sum_value: Precision::Inexact(ScalarValue::Int32(Some(1000))), + distinct_count: Precision::Absent, + byte_size: Precision::Inexact(60), + }], + }; + + let items = vec![&stats1, &stats2]; + let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap(); + + assert_eq!(summary_stats.num_rows, Precision::Inexact(30)); + assert_eq!(summary_stats.total_byte_size, Precision::Inexact(300)); + + let col_stats = &summary_stats.column_statistics[0]; + assert_eq!(col_stats.null_count, Precision::Inexact(3)); + assert_eq!( + col_stats.max_value, + Precision::Inexact(ScalarValue::Int32(Some(200))) + ); + assert_eq!( + col_stats.min_value, + Precision::Inexact(ScalarValue::Int32(Some(-5))) + ); + assert_eq!( + col_stats.sum_value, + Precision::Inexact(ScalarValue::Int64(Some(1500))) + ); + } + + #[test] + fn test_precision_min_in_place() { + // Exact vs Exact: 
keeps the smaller
+        let mut lhs = Precision::Exact(10);
+        precision_min(&mut lhs, &Precision::Exact(20));
+        assert_eq!(lhs, Precision::Exact(10));
+
+        let mut lhs = Precision::Exact(20);
+        precision_min(&mut lhs, &Precision::Exact(10));
+        assert_eq!(lhs, Precision::Exact(10));
+
+        // Equal exact values
+        let mut lhs = Precision::Exact(5);
+        precision_min(&mut lhs, &Precision::Exact(5));
+        assert_eq!(lhs, Precision::Exact(5));
+
+        // Mixed exact/inexact: result is Inexact with smaller value
+        let mut lhs = Precision::Exact(10);
+        precision_min(&mut lhs, &Precision::Inexact(20));
+        assert_eq!(lhs, Precision::Inexact(10));
+
+        let mut lhs = Precision::Inexact(10);
+        precision_min(&mut lhs, &Precision::Exact(5));
+        assert_eq!(lhs, Precision::Inexact(5));
+
+        // Inexact vs Inexact
+        let mut lhs = Precision::Inexact(30);
+        precision_min(&mut lhs, &Precision::Inexact(20));
+        assert_eq!(lhs, Precision::Inexact(20));
+
+        // Absent makes result Absent
+        let mut lhs = Precision::Exact(10);
+        precision_min(&mut lhs, &Precision::Absent);
+        assert_eq!(lhs, Precision::Absent);
+
+        let mut lhs = Precision::<usize>::Absent;
+        precision_min(&mut lhs, &Precision::Exact(10));
+        assert_eq!(lhs, Precision::Absent);
+    }
+
+    #[test]
+    fn test_precision_max_in_place() {
+        // Exact vs Exact: keeps the larger
+        let mut lhs = Precision::Exact(10);
+        precision_max(&mut lhs, &Precision::Exact(20));
+        assert_eq!(lhs, Precision::Exact(20));
+
+        let mut lhs = Precision::Exact(20);
+        precision_max(&mut lhs, &Precision::Exact(10));
+        assert_eq!(lhs, Precision::Exact(20));
+
+        // Equal exact values
+        let mut lhs = Precision::Exact(5);
+        precision_max(&mut lhs, &Precision::Exact(5));
+        assert_eq!(lhs, Precision::Exact(5));
+
+        // Mixed exact/inexact: result is Inexact with larger value
+        let mut lhs = Precision::Exact(10);
+        precision_max(&mut lhs, &Precision::Inexact(20));
+        assert_eq!(lhs, Precision::Inexact(20));
+
+        let mut lhs = Precision::Inexact(10);
+        precision_max(&mut lhs, &Precision::Exact(5));
+        assert_eq!(lhs, Precision::Inexact(10));
+
+        // Inexact vs Inexact
+        let mut lhs = Precision::Inexact(20);
+        precision_max(&mut lhs, &Precision::Inexact(30));
+        assert_eq!(lhs, Precision::Inexact(30));
+
+        // Absent makes result Absent
+        let mut lhs = Precision::Exact(10);
+        precision_max(&mut lhs, &Precision::Absent);
+        assert_eq!(lhs, Precision::Absent);
+
+        let mut lhs = Precision::<usize>::Absent;
+        precision_max(&mut lhs, &Precision::Exact(10));
+        assert_eq!(lhs, Precision::Absent);
+    }
+
+    #[test]
+    fn test_cast_sum_value_to_sum_type_in_place_widens_int32() {
+        let mut value = Precision::Exact(ScalarValue::Int32(Some(42)));
+        cast_sum_value_to_sum_type_in_place(&mut value);
+        assert_eq!(value, Precision::Exact(ScalarValue::Int64(Some(42))));
+    }
+
+    #[test]
+    fn test_cast_sum_value_to_sum_type_in_place_preserves_int64() {
+        // Int64 is already the sum type for Int64, no widening needed
+        let mut value = Precision::Exact(ScalarValue::Int64(Some(100)));
+        cast_sum_value_to_sum_type_in_place(&mut value);
+        assert_eq!(value, Precision::Exact(ScalarValue::Int64(Some(100))));
+    }
+
+    #[test]
+    fn test_cast_sum_value_to_sum_type_in_place_inexact() {
+        let mut value = Precision::Inexact(ScalarValue::Int32(Some(42)));
+        cast_sum_value_to_sum_type_in_place(&mut value);
+        assert_eq!(value, Precision::Inexact(ScalarValue::Int64(Some(42))));
+    }
+
+    #[test]
+    fn test_cast_sum_value_to_sum_type_in_place_absent() {
+        let mut value = Precision::<ScalarValue>::Absent;
+        cast_sum_value_to_sum_type_in_place(&mut value);
+        assert_eq!(value, Precision::Absent);
+
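As a cross-check on the two in-place helpers exercised above, the value-level merge rules they pin down can be written as a standalone sketch (simplified non-generic `Precision`; the name mirrors, but is not, the crate's helper):

```rust
// Merge lattice for min: Exact only survives when both sides are Exact,
// any Inexact operand makes the result Inexact, and Absent absorbs
// (without one operand there is no usable bound). `precision_max` is the
// mirror image with `max` in place of `min`.
#[derive(Debug, PartialEq, Clone, Copy)]
enum Precision {
    Exact(u64),
    Inexact(u64),
    Absent,
}

fn precision_min(lhs: Precision, rhs: Precision) -> Precision {
    use Precision::*;
    match (lhs, rhs) {
        (Exact(a), Exact(b)) => Exact(a.min(b)),
        (Exact(a) | Inexact(a), Exact(b) | Inexact(b)) => Inexact(a.min(b)),
        _ => Absent,
    }
}

fn main() {
    assert_eq!(
        precision_min(Precision::Exact(10), Precision::Inexact(20)),
        Precision::Inexact(10)
    );
    assert_eq!(
        precision_min(Precision::Exact(10), Precision::Absent),
        Precision::Absent
    );
}
```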
} + + #[test] + fn test_precision_add_for_sum_in_place_same_type() { + // Int64 + Int64: no widening needed, straight add + let mut lhs = Precision::Exact(ScalarValue::Int64(Some(10))); + let rhs = Precision::Exact(ScalarValue::Int64(Some(20))); + precision_add_for_sum_in_place(&mut lhs, &rhs); + assert_eq!(lhs, Precision::Exact(ScalarValue::Int64(Some(30)))); + } + + #[test] + fn test_precision_add_for_sum_in_place_widens_rhs() { + // lhs is already Int64 (widened), rhs is Int32 -> gets cast to Int64 + let mut lhs = Precision::Exact(ScalarValue::Int64(Some(10))); + let rhs = Precision::Exact(ScalarValue::Int32(Some(5))); + precision_add_for_sum_in_place(&mut lhs, &rhs); + assert_eq!(lhs, Precision::Exact(ScalarValue::Int64(Some(15)))); + } + + #[test] + fn test_precision_add_for_sum_in_place_inexact() { + let mut lhs = Precision::Inexact(ScalarValue::Int64(Some(10))); + let rhs = Precision::Inexact(ScalarValue::Int32(Some(5))); + precision_add_for_sum_in_place(&mut lhs, &rhs); + assert_eq!(lhs, Precision::Inexact(ScalarValue::Int64(Some(15)))); + } + + #[test] + fn test_precision_add_for_sum_in_place_absent_rhs() { + let mut lhs = Precision::Exact(ScalarValue::Int64(Some(10))); + precision_add_for_sum_in_place(&mut lhs, &Precision::Absent); + assert_eq!(lhs, Precision::Absent); } } diff --git a/datafusion/common/src/test_util.rs b/datafusion/common/src/test_util.rs index c51dea1c4de04..f060704944233 100644 --- a/datafusion/common/src/test_util.rs +++ b/datafusion/common/src/test_util.rs @@ -735,32 +735,34 @@ mod tests { let non_existing = cwd.join("non-existing-dir").display().to_string(); let non_existing_str = non_existing.as_str(); - env::set_var(udf_env, non_existing_str); - let res = get_data_dir(udf_env, existing_str); - assert!(res.is_err()); - - env::set_var(udf_env, ""); - let res = get_data_dir(udf_env, existing_str); - assert!(res.is_ok()); - assert_eq!(res.unwrap(), existing_pb); - - env::set_var(udf_env, " "); - let res = get_data_dir(udf_env, existing_str); - assert!(res.is_ok()); - assert_eq!(res.unwrap(), existing_pb); - - env::set_var(udf_env, existing_str); - let res = get_data_dir(udf_env, existing_str); - assert!(res.is_ok()); - assert_eq!(res.unwrap(), existing_pb); - - env::remove_var(udf_env); - let res = get_data_dir(udf_env, non_existing_str); - assert!(res.is_err()); - - let res = get_data_dir(udf_env, existing_str); - assert!(res.is_ok()); - assert_eq!(res.unwrap(), existing_pb); + unsafe { + env::set_var(udf_env, non_existing_str); + let res = get_data_dir(udf_env, existing_str); + assert!(res.is_err()); + + env::set_var(udf_env, ""); + let res = get_data_dir(udf_env, existing_str); + assert!(res.is_ok()); + assert_eq!(res.unwrap(), existing_pb); + + env::set_var(udf_env, " "); + let res = get_data_dir(udf_env, existing_str); + assert!(res.is_ok()); + assert_eq!(res.unwrap(), existing_pb); + + env::set_var(udf_env, existing_str); + let res = get_data_dir(udf_env, existing_str); + assert!(res.is_ok()); + assert_eq!(res.unwrap(), existing_pb); + + env::remove_var(udf_env); + let res = get_data_dir(udf_env, non_existing_str); + assert!(res.is_err()); + + let res = get_data_dir(udf_env, existing_str); + assert!(res.is_ok()); + assert_eq!(res.unwrap(), existing_pb); + } } #[test] diff --git a/datafusion/common/src/tree_node.rs b/datafusion/common/src/tree_node.rs index 9b36266eec2e9..39300b9564621 100644 --- a/datafusion/common/src/tree_node.rs +++ b/datafusion/common/src/tree_node.rs @@ -796,7 +796,9 @@ pub trait TreeNodeContainer<'a, T: 'a>: Sized { ) -> Result>; 
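The `Box` and `Arc` implementations that follow rewrite the container in place. A minimal standalone illustration of the `mem::take` pattern they rely on (names here are illustrative, not the DataFusion API):

```rust
// `mem::take` moves the inner value out while leaving `T::default()` behind,
// so the box always contains a valid value even if `f` panics or errors,
// and the original heap allocation is reused for the rewritten value.
fn map_boxed<T: Default>(
    mut boxed: Box<T>,
    f: impl FnOnce(T) -> Result<T, String>,
) -> Result<Box<T>, String> {
    let inner = std::mem::take(&mut *boxed);
    *boxed = f(inner)?;
    Ok(boxed)
}

fn main() {
    let b = Box::new(41i32);
    let before: *const i32 = &*b;
    let b = map_boxed(b, |v| Ok(v + 1)).unwrap();
    assert_eq!(*b, 42);
    // Same allocation before and after the rewrite.
    assert_eq!(before, &*b as *const i32);
}
```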
} -impl<'a, T: 'a, C: TreeNodeContainer<'a, T>> TreeNodeContainer<'a, T> for Box { +impl<'a, T: 'a, C: TreeNodeContainer<'a, T> + Default> TreeNodeContainer<'a, T> + for Box +{ fn apply_elements Result>( &'a self, f: F, @@ -805,14 +807,24 @@ impl<'a, T: 'a, C: TreeNodeContainer<'a, T>> TreeNodeContainer<'a, T> for Box } fn map_elements Result>>( - self, + mut self, f: F, ) -> Result> { - (*self).map_elements(f)?.map_data(|c| Ok(Self::new(c))) + // Rewrite in place so the existing heap allocation can be reused. + // `mem::take` hands the inner `C` to `f` while leaving + // `C::default()` in the slot, so an unwinding drop finds a valid + // `C` even if `f` panics or the `?` short-circuits. + let inner = std::mem::take(&mut *self); + Ok(inner.map_elements(f)?.update_data(|c| { + *self = c; + self + })) } } -impl<'a, T: 'a, C: TreeNodeContainer<'a, T> + Clone> TreeNodeContainer<'a, T> for Arc { +impl<'a, T: 'a, C: TreeNodeContainer<'a, T> + Clone + Default> TreeNodeContainer<'a, T> + for Arc +{ fn apply_elements Result>( &'a self, f: F, @@ -821,12 +833,18 @@ impl<'a, T: 'a, C: TreeNodeContainer<'a, T> + Clone> TreeNodeContainer<'a, T> fo } fn map_elements Result>>( - self, + mut self, f: F, ) -> Result> { - Arc::unwrap_or_clone(self) - .map_elements(f)? - .map_data(|c| Ok(Arc::new(c))) + // Rewrite in place using the same `mem::take` strategy as + // `Box::map_elements`. `Arc::make_mut` gives us exclusive + // access (cloning `C` first if we were sharing), after which + // `get_mut` is infallible. + let inner = std::mem::take(Arc::make_mut(&mut self)); + Ok(inner.map_elements(f)?.update_data(|c| { + *Arc::get_mut(&mut self).unwrap() = c; + self + })) } } @@ -956,12 +974,12 @@ impl<'a, T: 'a, C0: TreeNodeContainer<'a, T>, C1: TreeNodeContainer<'a, T>> } impl< - 'a, - T: 'a, - C0: TreeNodeContainer<'a, T>, - C1: TreeNodeContainer<'a, T>, - C2: TreeNodeContainer<'a, T>, - > TreeNodeContainer<'a, T> for (C0, C1, C2) + 'a, + T: 'a, + C0: TreeNodeContainer<'a, T>, + C1: TreeNodeContainer<'a, T>, + C2: TreeNodeContainer<'a, T>, +> TreeNodeContainer<'a, T> for (C0, C1, C2) { fn apply_elements Result>( &'a self, @@ -992,13 +1010,13 @@ impl< } impl< - 'a, - T: 'a, - C0: TreeNodeContainer<'a, T>, - C1: TreeNodeContainer<'a, T>, - C2: TreeNodeContainer<'a, T>, - C3: TreeNodeContainer<'a, T>, - > TreeNodeContainer<'a, T> for (C0, C1, C2, C3) + 'a, + T: 'a, + C0: TreeNodeContainer<'a, T>, + C1: TreeNodeContainer<'a, T>, + C2: TreeNodeContainer<'a, T>, + C3: TreeNodeContainer<'a, T>, +> TreeNodeContainer<'a, T> for (C0, C1, C2, C3) { fn apply_elements Result>( &'a self, @@ -1090,12 +1108,12 @@ impl<'a, T: 'a, C0: TreeNodeContainer<'a, T>, C1: TreeNodeContainer<'a, T>> } impl< - 'a, - T: 'a, - C0: TreeNodeContainer<'a, T>, - C1: TreeNodeContainer<'a, T>, - C2: TreeNodeContainer<'a, T>, - > TreeNodeRefContainer<'a, T> for (&'a C0, &'a C1, &'a C2) + 'a, + T: 'a, + C0: TreeNodeContainer<'a, T>, + C1: TreeNodeContainer<'a, T>, + C2: TreeNodeContainer<'a, T>, +> TreeNodeRefContainer<'a, T> for (&'a C0, &'a C1, &'a C2) { fn apply_ref_elements Result>( &self, @@ -1109,13 +1127,13 @@ impl< } impl< - 'a, - T: 'a, - C0: TreeNodeContainer<'a, T>, - C1: TreeNodeContainer<'a, T>, - C2: TreeNodeContainer<'a, T>, - C3: TreeNodeContainer<'a, T>, - > TreeNodeRefContainer<'a, T> for (&'a C0, &'a C1, &'a C2, &'a C3) + 'a, + T: 'a, + C0: TreeNodeContainer<'a, T>, + C1: TreeNodeContainer<'a, T>, + C2: TreeNodeContainer<'a, T>, + C3: TreeNodeContainer<'a, T>, +> TreeNodeRefContainer<'a, T> for (&'a C0, &'a C1, &'a C2, &'a C3) { fn 
apply_ref_elements Result>( &self, @@ -1335,14 +1353,15 @@ impl TreeNode for T { pub(crate) mod tests { use std::collections::HashMap; use std::fmt::Display; + use std::sync::Arc; + use crate::Result; use crate::tree_node::{ Transformed, TreeNode, TreeNodeContainer, TreeNodeRecursion, TreeNodeRewriter, TreeNodeVisitor, }; - use crate::Result; - #[derive(Debug, Eq, Hash, PartialEq, Clone)] + #[derive(Debug, Default, Eq, Hash, PartialEq, Clone)] pub struct TestTreeNode { pub(crate) children: Vec>, pub(crate) data: T, @@ -2431,4 +2450,46 @@ pub(crate) mod tests { item.visit(&mut visitor).unwrap(); } + + #[test] + fn box_map_elements_reuses_allocation() { + let boxed = Box::new(TestTreeNode::new_leaf(42i32)); + let before: *const TestTreeNode = &*boxed; + let out = boxed.map_elements(|n| Ok(Transformed::no(n))).unwrap(); + let after: *const TestTreeNode = &*out.data; + assert_eq!(after, before); + } + + #[test] + fn arc_map_elements_reuses_allocation_when_unique() { + let arc = Arc::new(TestTreeNode::new_leaf(42i32)); + let before = Arc::as_ptr(&arc); + let out = arc.map_elements(|n| Ok(Transformed::no(n))).unwrap(); + assert_eq!(Arc::as_ptr(&out.data), before); + } + + #[test] + fn arc_map_elements_clones_when_shared() { + // When the input `Arc` is shared, `make_mut` clones into a fresh + // allocation, so the reuse optimization does not apply. + let arc = Arc::new(TestTreeNode::new_leaf(42i32)); + let _keepalive = Arc::clone(&arc); + let before = Arc::as_ptr(&arc); + let out = arc.map_elements(|n| Ok(Transformed::no(n))).unwrap(); + assert_ne!(Arc::as_ptr(&out.data), before); + } + + #[test] + fn box_map_elements_panic() { + use std::panic::{AssertUnwindSafe, catch_unwind}; + let boxed = Box::new(TestTreeNode::new_leaf(42i32)); + let result = catch_unwind(AssertUnwindSafe(|| { + boxed + .map_elements(|_: TestTreeNode| -> Result<_> { + panic!("simulated panic during rewrite") + }) + .ok() + })); + assert!(result.is_err()); + } } diff --git a/datafusion/common/src/types/builtin.rs b/datafusion/common/src/types/builtin.rs index 314529b99a342..dfd2cc4cf2d8b 100644 --- a/datafusion/common/src/types/builtin.rs +++ b/datafusion/common/src/types/builtin.rs @@ -16,6 +16,7 @@ // under the License. use arrow::datatypes::IntervalUnit::*; +use arrow::datatypes::TimeUnit::*; use crate::types::{LogicalTypeRef, NativeType}; use std::sync::{Arc, LazyLock}; @@ -82,3 +83,17 @@ singleton_variant!( Interval, MonthDayNano ); + +singleton_variant!( + LOGICAL_INTERVAL_YEAR_MONTH, + logical_interval_year_month, + Interval, + YearMonth +); + +singleton_variant!( + LOGICAL_DURATION_MICROSECOND, + logical_duration_microsecond, + Duration, + Microsecond +); diff --git a/datafusion/common/src/types/canonical_extensions/bool8.rs b/datafusion/common/src/types/canonical_extensions/bool8.rs new file mode 100644 index 0000000000000..e0f7a5914a6ed --- /dev/null +++ b/datafusion/common/src/types/canonical_extensions/bool8.rs @@ -0,0 +1,123 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::Result; +use crate::error::_internal_err; +use crate::types::extension::DFExtensionType; +use arrow::array::{Array, Int8Array}; +use arrow::datatypes::DataType; +use arrow::util::display::{ArrayFormatter, DisplayIndex, FormatOptions, FormatResult}; +use arrow_schema::extension::{Bool8, ExtensionType}; +use std::fmt::Write; + +/// Defines the extension type logic for the canonical `arrow.bool8` extension type. This extension +/// type allows storing a Boolean value in a single byte, instead of a single bit. +/// +/// See [`DFExtensionType`] for information on DataFusion's extension type mechanism. See also +/// [`Bool8`] for the implementation of arrow-rs, which this type uses internally. +/// +/// +#[derive(Debug, Clone)] +pub struct DFBool8(Bool8); + +impl DFBool8 { + /// Creates a new [`DFBool8`], validating that the storage type is compatible with the + /// extension type. + /// + /// Even though [`DFBool8`] only supports a single storage type ([`DataType::Int8`]), passing-in + /// the storage type allows conveniently validating whether this extension type is compatible + /// with a given [`DataType`]. + pub fn try_new( + data_type: &DataType, + metadata: ::Metadata, + ) -> Result { + // Validates the storage type + Ok(Self(::try_new( + data_type, metadata, + )?)) + } +} + +impl DFExtensionType for DFBool8 { + fn storage_type(&self) -> DataType { + DataType::Int8 + } + + fn serialize_metadata(&self) -> Option { + self.0.serialize_metadata() + } + + fn create_array_formatter<'fmt>( + &self, + array: &'fmt dyn Array, + options: &FormatOptions<'fmt>, + ) -> Result>> { + if array.data_type() != &DataType::Int8 { + return _internal_err!("Wrong array type for Bool8"); + } + + let display_index = Bool8ValueDisplayIndex { + array: array.as_any().downcast_ref().unwrap(), + null_str: options.null(), + }; + Ok(Some(ArrayFormatter::new( + Box::new(display_index), + options.safe(), + ))) + } +} + +/// Pretty printer for binary bool8 values. 
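The display rule implemented by the `DisplayIndex` below is C-style truthiness over the Int8 storage; note the test further down expects `-20` to print as `true`. A tiny sketch:

```rust
// arrow.bool8 stores booleans as Int8: zero is false, any non-zero byte
// (including negative values) is true.
fn bool8_to_str(byte: i8) -> &'static str {
    if byte != 0 { "true" } else { "false" }
}

fn main() {
    assert_eq!(bool8_to_str(0), "false");
    assert_eq!(bool8_to_str(1), "true");
    assert_eq!(bool8_to_str(-20), "true");
}
```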
+#[derive(Debug, Clone, Copy)] +struct Bool8ValueDisplayIndex<'a> { + array: &'a Int8Array, + null_str: &'a str, +} + +impl DisplayIndex for Bool8ValueDisplayIndex<'_> { + fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { + if self.array.is_null(idx) { + write!(f, "{}", self.null_str)?; + return Ok(()); + } + + let bytes = self.array.value(idx); + write!(f, "{}", bytes != 0)?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + pub fn test_pretty_bool8() { + let values = Int8Array::from_iter([Some(0), Some(1), Some(-20), None]); + + let extension_type = DFBool8(Bool8 {}); + let formatter = extension_type + .create_array_formatter(&values, &FormatOptions::default().with_null("NULL")) + .unwrap() + .unwrap(); + + assert_eq!(formatter.value(0).to_string(), "false"); + assert_eq!(formatter.value(1).to_string(), "true"); + assert_eq!(formatter.value(2).to_string(), "true"); + assert_eq!(formatter.value(3).to_string(), "NULL"); + } +} diff --git a/datafusion/common/src/types/canonical_extensions/fixed_shape_tensor.rs b/datafusion/common/src/types/canonical_extensions/fixed_shape_tensor.rs new file mode 100644 index 0000000000000..9148d9a1b39f2 --- /dev/null +++ b/datafusion/common/src/types/canonical_extensions/fixed_shape_tensor.rs @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::Result; +use crate::types::extension::DFExtensionType; +use arrow::datatypes::DataType; +use arrow_schema::extension::{ExtensionType, FixedShapeTensor}; + +/// Defines the extension type logic for the canonical `arrow.fixed_shape_tensor` extension type. +/// This extension type can be used to store a [tensor](https://en.wikipedia.org/wiki/Tensor) of +/// a fixed shape. +/// +/// See [`DFExtensionType`] for information on DataFusion's extension type mechanism. See also +/// [`FixedShapeTensor`] for the implementation of arrow-rs, which this type uses internally. +/// +/// +#[derive(Debug, Clone)] +pub struct DFFixedShapeTensor { + inner: FixedShapeTensor, + /// The storage type of the tensor. + /// + /// While we could reconstruct the storage type from the inner [`FixedShapeTensor`], we may + /// choose a different name for the field within the [`DataType::FixedSizeList`] which can + /// cause problems down the line (e.g., checking for equality). + storage_type: DataType, +} + +impl DFFixedShapeTensor { + /// Creates a new [`DFFixedShapeTensor`], validating that the storage type is compatible with + /// the extension type. 
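The cached `storage_type` above exists because `DataType` equality is sensitive to the inner field name chosen for the list. A small demonstration with arrow-rs:

```rust
use arrow::datatypes::{DataType, Field};
use std::sync::Arc;

fn main() {
    // Identical shape and element type, but different inner field names:
    // the two FixedSizeList data types do not compare equal, which is why
    // the original storage type is cached instead of being reconstructed.
    let a = DataType::FixedSizeList(
        Arc::new(Field::new("item", DataType::Int32, false)),
        3,
    );
    let b = DataType::FixedSizeList(
        Arc::new(Field::new("element", DataType::Int32, false)),
        3,
    );
    assert_ne!(a, b);
}
```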
+ pub fn try_new( + data_type: &DataType, + metadata: ::Metadata, + ) -> Result { + Ok(Self { + inner: ::try_new(data_type, metadata)?, + storage_type: data_type.clone(), + }) + } +} + +impl DFExtensionType for DFFixedShapeTensor { + fn storage_type(&self) -> DataType { + self.storage_type.clone() + } + + fn serialize_metadata(&self) -> Option { + self.inner.serialize_metadata() + } +} diff --git a/datafusion/common/src/types/canonical_extensions/json.rs b/datafusion/common/src/types/canonical_extensions/json.rs new file mode 100644 index 0000000000000..8be9993a26061 --- /dev/null +++ b/datafusion/common/src/types/canonical_extensions/json.rs @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::Result; +use crate::types::extension::DFExtensionType; +use arrow::datatypes::DataType; +use arrow_schema::extension::{ExtensionType, Json}; + +/// Defines the extension type logic for the canonical `arrow.json` extension type. This extension +/// type defines that a particular string field stores JSON values. +/// +/// See [`DFExtensionType`] for information on DataFusion's extension type mechanism. See also +/// [`Json`] for the implementation of arrow-rs, which this type uses internally. +/// +/// +#[derive(Debug, Clone)] +pub struct DFJson { + inner: Json, + storage_type: DataType, +} + +impl DFJson { + /// Creates a new [`DFJson`], validating that the storage type is compatible with the + /// extension type. + pub fn try_new( + data_type: &DataType, + metadata: ::Metadata, + ) -> Result { + Ok(Self { + inner: ::try_new(data_type, metadata)?, + storage_type: data_type.clone(), + }) + } +} + +impl DFExtensionType for DFJson { + fn storage_type(&self) -> DataType { + self.storage_type.clone() + } + + fn serialize_metadata(&self) -> Option { + self.inner.serialize_metadata() + } +} diff --git a/datafusion/common/src/types/canonical_extensions/mod.rs b/datafusion/common/src/types/canonical_extensions/mod.rs new file mode 100644 index 0000000000000..2d74d0669d213 --- /dev/null +++ b/datafusion/common/src/types/canonical_extensions/mod.rs @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +mod bool8; +mod fixed_shape_tensor; +mod json; +mod opaque; +mod timestamp_with_offset; +mod uuid; +mod variable_shape_tensor; + +pub use bool8::DFBool8; +pub use fixed_shape_tensor::DFFixedShapeTensor; +pub use json::DFJson; +pub use opaque::DFOpaque; +pub use timestamp_with_offset::DFTimestampWithOffset; +pub use uuid::DFUuid; +pub use variable_shape_tensor::DFVariableShapeTensor; diff --git a/datafusion/common/src/types/canonical_extensions/opaque.rs b/datafusion/common/src/types/canonical_extensions/opaque.rs new file mode 100644 index 0000000000000..d14f07737b6a7 --- /dev/null +++ b/datafusion/common/src/types/canonical_extensions/opaque.rs @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::Result; +use crate::types::extension::DFExtensionType; +use arrow::datatypes::DataType; +use arrow_schema::extension::{ExtensionType, Opaque}; + +/// Defines the extension type logic for the canonical `arrow.opaque` extension type. This extension +/// type represents types that DataFusion cannot interpret. +/// +/// See [`DFExtensionType`] for information on DataFusion's extension type mechanism. See also +/// [`Opaque`] for the implementation of arrow-rs, which this type uses internally. +/// +/// +#[derive(Debug, Clone)] +pub struct DFOpaque { + inner: Opaque, + storage_type: DataType, +} + +impl DFOpaque { + /// Creates a new [`DFOpaque`], validating that the storage type is compatible with the + /// extension type. + pub fn try_new( + data_type: &DataType, + metadata: ::Metadata, + ) -> Result { + Ok(Self { + inner: ::try_new(data_type, metadata)?, + storage_type: data_type.clone(), + }) + } +} + +impl DFExtensionType for DFOpaque { + fn storage_type(&self) -> DataType { + self.storage_type.clone() + } + + fn serialize_metadata(&self) -> Option { + self.inner.serialize_metadata() + } +} diff --git a/datafusion/common/src/types/canonical_extensions/timestamp_with_offset.rs b/datafusion/common/src/types/canonical_extensions/timestamp_with_offset.rs new file mode 100644 index 0000000000000..58a5fff9d0c28 --- /dev/null +++ b/datafusion/common/src/types/canonical_extensions/timestamp_with_offset.rs @@ -0,0 +1,304 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::Result; +use crate::ScalarValue; +use crate::error::_internal_err; +use crate::types::extension::DFExtensionType; +use arrow::array::{Array, AsArray, Int16Array}; +use arrow::buffer::NullBuffer; +use arrow::compute::cast; +use arrow::datatypes::{ + DataType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType, + TimestampNanosecondType, TimestampSecondType, +}; +use arrow::util::display::{ArrayFormatter, DisplayIndex, FormatOptions, FormatResult}; +use arrow_schema::ArrowError; +use arrow_schema::extension::{ExtensionType, TimestampWithOffset}; +use std::fmt::Write; + +/// Defines the extension type logic for the canonical `arrow.timestamp_with_offset` extension type. +/// This extension type allows associating a different offset for each timestamp in a column. +/// +/// See [`DFExtensionType`] for information on DataFusion's extension type mechanism. See also +/// [`TimestampWithOffset`] for the implementation of arrow-rs, which this type uses internally. +/// +/// +#[derive(Debug, Clone)] +pub struct DFTimestampWithOffset { + inner: TimestampWithOffset, + storage_type: DataType, +} + +impl DFTimestampWithOffset { + /// Creates a new [`DFTimestampWithOffset`], validating that the storage type is compatible with + /// the extension type. + pub fn try_new( + data_type: &DataType, + metadata: ::Metadata, + ) -> Result { + Ok(Self { + inner: ::try_new(data_type, metadata)?, + storage_type: data_type.clone(), + }) + } +} + +impl DFExtensionType for DFTimestampWithOffset { + fn storage_type(&self) -> DataType { + self.storage_type.clone() + } + + fn serialize_metadata(&self) -> Option { + self.inner.serialize_metadata() + } + + fn create_array_formatter<'fmt>( + &self, + array: &'fmt dyn Array, + options: &FormatOptions<'fmt>, + ) -> Result>> { + if array.data_type() != &self.storage_type { + return _internal_err!( + "Unexpected data type for TimestampWithOffset: {}", + array.data_type() + ); + } + + let struct_array = array.as_struct(); + let timestamp_array = struct_array + .column_by_name("timestamp") + .expect("Type checked above") + .as_ref(); + let raw_offset_array = struct_array + .column_by_name("offset_minutes") + .expect("Type checked above"); + + // Get a regular [`Int16Array`], if the offset array is a dictionary or run-length encoded. + let offset_array = cast(&raw_offset_array, &DataType::Int16)? + .as_primitive() + .clone(); + + let display_index = TimestampWithOffsetDisplayIndex { + null_buffer: struct_array.nulls(), + timestamp_array, + offset_array, + options: options.clone(), + }; + + Ok(Some(ArrayFormatter::new( + Box::new(display_index), + options.safe(), + ))) + } +} + +struct TimestampWithOffsetDisplayIndex<'a> { + /// The inner arrays are always non-null. Use the null buffer of the struct array to check + /// whether an element is null. 
+ null_buffer: Option<&'a NullBuffer>, + timestamp_array: &'a dyn Array, + offset_array: Int16Array, + options: FormatOptions<'a>, +} + +impl DisplayIndex for TimestampWithOffsetDisplayIndex<'_> { + fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { + if self.null_buffer.map(|nb| nb.is_null(idx)).unwrap_or(false) { + write!(f, "{}", self.options.null())?; + return Ok(()); + } + + let offset_minutes = self.offset_array.value(idx); + let offset = format_offset(offset_minutes); + + // The timestamp array must be UTC, so we can ignore the timezone. + let scalar = match self.timestamp_array.data_type() { + DataType::Timestamp(TimeUnit::Second, _) => { + let ts = self + .timestamp_array + .as_primitive::() + .value(idx); + ScalarValue::TimestampSecond(Some(ts), Some(offset.into())) + } + DataType::Timestamp(TimeUnit::Millisecond, _) => { + let ts = self + .timestamp_array + .as_primitive::() + .value(idx); + ScalarValue::TimestampMillisecond(Some(ts), Some(offset.into())) + } + DataType::Timestamp(TimeUnit::Microsecond, _) => { + let ts = self + .timestamp_array + .as_primitive::() + .value(idx); + ScalarValue::TimestampMicrosecond(Some(ts), Some(offset.into())) + } + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + let ts = self + .timestamp_array + .as_primitive::() + .value(idx); + ScalarValue::TimestampNanosecond(Some(ts), Some(offset.into())) + } + _ => unreachable!("TimestampWithOffset storage must be a Timestamp array"), + }; + + let array = scalar.to_array().map_err(|_| { + ArrowError::ComputeError("Failed to convert scalar to array".to_owned()) + })?; + let formatter = ArrayFormatter::try_new(&array, &self.options)?; + formatter.value(0).write(f)?; + + Ok(()) + } +} + +/// Formats the offset in the format `+/-HH:MM`, which can be used as an offset in the regular +/// timestamp types. 
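End-to-end, the formatter above renders a UTC-stored timestamp at its per-row offset. A sketch of the expected output for one row from the tests below, using chrono (already a test dependency here):

```rust
use chrono::{FixedOffset, TimeZone, Utc};

fn main() {
    // Stored value: 2024-04-01T00:00:00 UTC; per-row offset: -105 minutes.
    let utc = Utc.with_ymd_and_hms(2024, 4, 1, 0, 0, 0).unwrap();
    let offset = FixedOffset::east_opt(-105 * 60).unwrap(); // 1h45m west
    assert_eq!(
        utc.with_timezone(&offset).to_rfc3339(),
        "2024-03-31T22:15:00-01:45"
    );
}
```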
+fn format_offset(minutes: i16) -> String { + let sign = if minutes >= 0 { '+' } else { '-' }; + let minutes = minutes.abs(); + let hours = minutes / 60; + let minutes = minutes % 60; + format!("{sign}{hours:02}:{minutes:02}") +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::{ + Array, DictionaryArray, Int16Array, Int32Array, RunArray, StructArray, + TimestampSecondArray, + }; + use arrow::buffer::NullBuffer; + use arrow::datatypes::{Field, Fields, Int16Type, Int32Type}; + use chrono::{TimeZone, Utc}; + use std::sync::Arc; + + #[test] + fn test_pretty_print_timestamp_with_offset() -> Result<(), ArrowError> { + let ts = Utc + .with_ymd_and_hms(2024, 4, 1, 0, 0, 0) + .unwrap() + .timestamp(); + + let offset_array = Arc::new(Int16Array::from(vec![60, -105, 0])); + + run_formatting_test( + vec![ts, ts, ts], + offset_array, + Some(NullBuffer::from(vec![true, true, false])), + FormatOptions::default().with_null("NULL"), + &[ + "2024-04-01T01:00:00+01:00", + "2024-03-31T22:15:00-01:45", + "NULL", + ], + ) + } + + #[test] + fn test_pretty_print_dictionary_offset() -> Result<(), ArrowError> { + let ts = Utc + .with_ymd_and_hms(2024, 4, 1, 12, 0, 0) + .unwrap() + .timestamp(); + + let offset_array = Arc::new(DictionaryArray::::new( + Int16Array::from(vec![0, 1, 0]), + Arc::new(Int16Array::from(vec![60, -60])), + )); + + run_formatting_test( + vec![ts, ts, ts], + offset_array, + None, + FormatOptions::default(), + &[ + "2024-04-01T13:00:00+01:00", + "2024-04-01T11:00:00-01:00", + "2024-04-01T13:00:00+01:00", + ], + ) + } + + #[test] + fn test_pretty_print_rle_offset() -> Result<(), ArrowError> { + let ts = Utc + .with_ymd_and_hms(2024, 4, 1, 12, 0, 0) + .unwrap() + .timestamp(); + + let run_ends = Int32Array::from(vec![2]); + let values = Int16Array::from(vec![120]); + let offset_array = Arc::new(RunArray::::try_new(&run_ends, &values)?); + + run_formatting_test( + vec![ts, ts], + offset_array, + None, + FormatOptions::default(), + &["2024-04-01T14:00:00+02:00", "2024-04-01T14:00:00+02:00"], + ) + } + + /// Create valid fields with flexible offset types + fn create_fields_custom_offset(time_unit: TimeUnit, offset_type: DataType) -> Fields { + let ts_field = Field::new( + "timestamp", + DataType::Timestamp(time_unit, Some("UTC".into())), + false, + ); + let offset_field = Field::new("offset_minutes", offset_type, false); + Fields::from(vec![ts_field, offset_field]) + } + + /// Helper to construct the arrays, run the formatter, and assert the expected strings. + fn run_formatting_test( + timestamps: Vec, + offset_array: Arc, + null_buffer: Option, + options: FormatOptions, + expected: &[&str], + ) -> Result<(), ArrowError> { + let fields = create_fields_custom_offset( + TimeUnit::Second, + offset_array.data_type().clone(), + ); + + let struct_array = StructArray::try_new( + fields, + vec![ + Arc::new(TimestampSecondArray::from(timestamps).with_timezone("UTC")), + offset_array, + ], + null_buffer, + )?; + + let formatter = DFTimestampWithOffset::try_new(struct_array.data_type(), ())? + .create_array_formatter(&struct_array, &options)? 
+ .unwrap(); + + for (i, expected_str) in expected.iter().enumerate() { + assert_eq!(formatter.value(i).to_string(), *expected_str); + } + + Ok(()) + } +} diff --git a/datafusion/common/src/types/canonical_extensions/uuid.rs b/datafusion/common/src/types/canonical_extensions/uuid.rs new file mode 100644 index 0000000000000..8cbcf3f58a80e --- /dev/null +++ b/datafusion/common/src/types/canonical_extensions/uuid.rs @@ -0,0 +1,124 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::Result; +use crate::error::_internal_err; +use crate::types::extension::DFExtensionType; +use arrow::array::{Array, FixedSizeBinaryArray}; +use arrow::datatypes::DataType; +use arrow::util::display::{ArrayFormatter, DisplayIndex, FormatOptions, FormatResult}; +use arrow_schema::extension::{ExtensionType, Uuid}; +use std::fmt::Write; +use uuid::Bytes; + +/// Defines the extension type logic for the canonical `arrow.uuid` extension type. This extension +/// type defines that a field should be interpreted as a +/// [UUID](https://de.wikipedia.org/wiki/Universally_Unique_Identifier). +/// +/// See [`DFExtensionType`] for information on DataFusion's extension type mechanism. See also +/// [`Uuid`] for the implementation of arrow-rs, which this type uses internally. +/// +/// +#[derive(Debug, Clone)] +pub struct DFUuid(Uuid); + +impl DFUuid { + /// Creates a new [`DFUuid`], validating that the storage type is compatible with the + /// extension type. + pub fn try_new( + data_type: &DataType, + metadata: ::Metadata, + ) -> Result { + Ok(Self(::try_new(data_type, metadata)?)) + } +} + +impl DFExtensionType for DFUuid { + fn storage_type(&self) -> DataType { + DataType::FixedSizeBinary(16) + } + + fn serialize_metadata(&self) -> Option { + self.0.serialize_metadata() + } + + fn create_array_formatter<'fmt>( + &self, + array: &'fmt dyn Array, + options: &FormatOptions<'fmt>, + ) -> Result>> { + if array.data_type() != &DataType::FixedSizeBinary(16) { + return _internal_err!("Wrong array type for Uuid"); + } + + let display_index = UuidValueDisplayIndex { + array: array.as_any().downcast_ref().unwrap(), + null_str: options.null(), + }; + Ok(Some(ArrayFormatter::new( + Box::new(display_index), + options.safe(), + ))) + } +} + +/// Pretty printer for binary UUID values. 
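The display rule below delegates to the `uuid` crate (already imported in this module): 16 raw bytes become the canonical hyphenated form. A minimal sketch:

```rust
fn main() {
    let bytes = [0u8; 16];
    let uuid = uuid::Uuid::from_bytes(bytes);
    assert_eq!(uuid.to_string(), "00000000-0000-0000-0000-000000000000");
}
```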
+#[derive(Debug, Clone, Copy)] +struct UuidValueDisplayIndex<'a> { + array: &'a FixedSizeBinaryArray, + null_str: &'a str, +} + +impl DisplayIndex for UuidValueDisplayIndex<'_> { + fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { + if self.array.is_null(idx) { + write!(f, "{}", self.null_str)?; + return Ok(()); + } + + let bytes = Bytes::try_from(self.array.value(idx)) + .expect("FixedSizeBinaryArray length checked in create_array_formatter"); + let uuid = uuid::Uuid::from_bytes(bytes); + write!(f, "{uuid}")?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::ScalarValue; + use arrow_schema::ArrowError; + + #[test] + pub fn test_pretty_print_uuid() -> Result<(), ArrowError> { + let my_uuid = uuid::Uuid::nil(); + let uuid = ScalarValue::FixedSizeBinary(16, Some(my_uuid.as_bytes().to_vec())) + .to_array_of_size(1)?; + + let formatter = DFUuid::try_new(uuid.data_type(), ())? + .create_array_formatter(uuid.as_ref(), &FormatOptions::default())? + .unwrap(); + + assert_eq!( + formatter.value(0).to_string(), + "00000000-0000-0000-0000-000000000000" + ); + + Ok(()) + } +} diff --git a/datafusion/common/src/types/canonical_extensions/variable_shape_tensor.rs b/datafusion/common/src/types/canonical_extensions/variable_shape_tensor.rs new file mode 100644 index 0000000000000..00f59c70160e5 --- /dev/null +++ b/datafusion/common/src/types/canonical_extensions/variable_shape_tensor.rs @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::Result; +use crate::types::extension::DFExtensionType; +use arrow::datatypes::DataType; +use arrow_schema::extension::{ExtensionType, VariableShapeTensor}; + +/// Defines the extension type logic for the canonical `arrow.variable_shape_tensor` extension type. +/// This extension type can be used to store a [tensor](https://en.wikipedia.org/wiki/Tensor) with +/// variable shape that can change for each element. +/// +/// See [`DFExtensionType`] for information on DataFusion's extension type mechanism. See also +/// [`VariableShapeTensor`] for the implementation of arrow-rs, which this type uses internally. +/// +/// +#[derive(Debug, Clone)] +pub struct DFVariableShapeTensor { + inner: VariableShapeTensor, + /// While we could reconstruct the storage type from the inner [`VariableShapeTensor`], we may + /// choose a different name for the field within the [`DataType::List`] which can cause problems + /// down the line (e.g., checking for equality). + storage_type: DataType, +} + +impl DFVariableShapeTensor { + /// Creates a new [`DFVariableShapeTensor`], validating that the storage type is compatible with + /// the extension type. 
+ pub fn try_new( + data_type: &DataType, + metadata: ::Metadata, + ) -> Result { + Ok(Self { + inner: ::try_new(data_type, metadata)?, + storage_type: data_type.clone(), + }) + } +} + +impl DFExtensionType for DFVariableShapeTensor { + fn storage_type(&self) -> DataType { + self.storage_type.clone() + } + + fn serialize_metadata(&self) -> Option { + self.inner.serialize_metadata() + } +} diff --git a/datafusion/common/src/types/extension.rs b/datafusion/common/src/types/extension.rs new file mode 100644 index 0000000000000..3bcb533dbf9e6 --- /dev/null +++ b/datafusion/common/src/types/extension.rs @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::error::Result; +use arrow::array::Array; +use arrow::util::display::{ArrayFormatter, FormatOptions}; +use arrow_schema::DataType; +use std::fmt::Debug; +use std::sync::Arc; + +/// A cheaply cloneable pointer to a [`DFExtensionType`]. +pub type DFExtensionTypeRef = Arc; + +/// Represents an implementation of a DataFusion extension type, including the storage [`DataType`]. +/// While, in general, an extension type can support several different storage types, a specific +/// instance of it is always locked into just one exact storage type and metadata pairing. +/// +/// This trait allows users to customize the behavior of DataFusion for certain types. Having this +/// ability is necessary because extension types affect how columns should be treated by the query +/// engine. This effect includes which operations are possible on a column and what are the expected +/// results from these operations. The extension type mechanism allows users to define how these +/// operations apply to a particular extension type. +/// +/// For example, adding two values of [`Int64`](arrow::datatypes::DataType::Int64) is a sensible +/// thing to do. However, if the same column is annotated with an extension type like `custom.id`, +/// the correct interpretation of a column changes. Adding together two `custom.id` values, even +/// though they are stored as integers, may no longer make sense. +/// +/// Note that DataFusion's extension type support is still young and therefore might not cover all +/// relevant use cases. Currently, the following operations can be customized: +/// - Pretty-printing values in record batches +/// +/// # Relation to Arrow's [`ExtensionType`](arrow_schema::extension::ExtensionType) +/// +/// The purpose of Arrow's [`ExtensionType`](arrow_schema::extension::ExtensionType) trait, for the +/// time being, is to allow reading and writing extension type metadata in a type-safe manner. The +/// trait does not provide any customization options. 
+/// Therefore, downstream users (such as
+/// DataFusion) have the flexibility to implement the extension type mechanism according to their
+/// needs. [`DFExtensionType`] is DataFusion's implementation of this extension type mechanism.
+///
+/// Furthermore, the current trait in arrow-rs is not dyn-compatible, whereas dyn-compatibility is
+/// needed to implement extension type registries. In the future, the two implementations may
+/// increasingly converge.
+///
+/// Another difference is that [`DFExtensionType`] represents a fully resolved extension type that
+/// has a fixed storage type (i.e., [`DataType`]). This is different from arrow-rs, which only
+/// stores the extension type's metadata. For example, an instance of DataFusion's JSON extension
+/// type fixes one of the three possible storage types: [`DataType::Utf8`],
+/// [`DataType::LargeUtf8`], or [`DataType::Utf8View`]. This fixed storage type is returned by
+/// [`DFExtensionType::storage_type`]. This is not possible in arrow-rs' extension type instances.
+/// This is why we have separate types in DataFusion that usually delegate the metadata
+/// processing to the underlying arrow-rs extension type instance
+/// (e.g., [`DFJson`](crate::types::DFJson) instead of [`Json`](arrow_schema::extension::Json)).
+///
+/// # Examples
+///
+/// Examples for using the extension type machinery can be found in the DataFusion examples
+/// directory.
+pub trait DFExtensionType: Debug + Send + Sync {
+    /// Returns the underlying storage type.
+    fn storage_type(&self) -> DataType;
+
+    /// Returns the serialized metadata.
+    fn serialize_metadata(&self) -> Option<String>;
+
+    /// Returns an [`ArrayFormatter`] that can format values of this type.
+    ///
+    /// If `Ok(None)` is returned, the default implementation will be used.
+    /// If an error is returned, there was an error creating the formatter.
+    fn create_array_formatter<'fmt>(
+        &self,
+        _array: &'fmt dyn Array,
+        _options: &FormatOptions<'fmt>,
+    ) -> Result<Option<ArrayFormatter<'fmt>>> {
+        Ok(None)
+    }
+}
diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs
index 674b1a41204d1..f11f1b47b16d3 100644
--- a/datafusion/common/src/types/logical.rs
+++ b/datafusion/common/src/types/logical.rs
@@ -100,7 +100,10 @@ impl fmt::Debug for dyn LogicalType {
 impl std::fmt::Display for dyn LogicalType {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{self:?}")
+        match self.signature() {
+            TypeSignature::Native(_) => write!(f, "{}", self.native()),
+            TypeSignature::Extension { name, .. } => write!(f, "{name}"),
+        }
     }
 }
 
@@ -132,3 +135,118 @@ impl Hash for dyn LogicalType {
         self.signature().hash(state);
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::types::{
+        LogicalField, LogicalFields, logical_boolean, logical_date, logical_float32,
+        logical_float64, logical_int32, logical_int64, logical_null, logical_string,
+    };
+    use arrow::datatypes::{Field, Fields};
+    use insta::assert_snapshot;
+
+    #[test]
+    fn test_logical_type_display_simple() {
+        assert_snapshot!(logical_null(), @"Null");
+        assert_snapshot!(logical_boolean(), @"Boolean");
+        assert_snapshot!(logical_int32(), @"Int32");
+        assert_snapshot!(logical_int64(), @"Int64");
+        assert_snapshot!(logical_float32(), @"Float32");
+        assert_snapshot!(logical_float64(), @"Float64");
+        assert_snapshot!(logical_string(), @"String");
+        assert_snapshot!(logical_date(), @"Date");
+    }
+
+    #[test]
+    fn test_logical_type_display_list() {
+        let list_type: Arc<dyn LogicalType> = Arc::new(NativeType::List(Arc::new(
+            LogicalField::from(&Field::new("item", DataType::Int32, true)),
+        )));
+        assert_snapshot!(list_type, @"List(Int32)");
+    }
+
+    #[test]
+    fn test_logical_type_display_struct() {
+        let struct_type: Arc<dyn LogicalType> = Arc::new(NativeType::Struct(
+            LogicalFields::from(&Fields::from(vec![
+                Field::new("x", DataType::Float64, false),
+                Field::new("y", DataType::Float64, true),
+            ])),
+        ));
+        assert_snapshot!(struct_type, @r#"Struct("x": non-null Float64, "y": Float64)"#);
+    }
+
+    #[test]
+    fn test_logical_type_display_fixed_size_list() {
+        let fsl_type: Arc<dyn LogicalType> = Arc::new(NativeType::FixedSizeList(
+            Arc::new(LogicalField::from(&Field::new(
+                "item",
+                DataType::Float32,
+                false,
+            ))),
+            3,
+        ));
+        assert_snapshot!(fsl_type, @"FixedSizeList(3 x non-null Float32)");
+    }
+
+    #[test]
+    fn test_logical_type_display_map() {
+        let map_type: Arc<dyn LogicalType> = Arc::new(NativeType::Map(Arc::new(
+            LogicalField::from(&Field::new("entries", DataType::Utf8, false)),
+        )));
+        assert_snapshot!(map_type, @"Map(non-null String)");
+    }
+
+    #[test]
+    fn test_logical_type_display_union() {
+        use arrow::datatypes::UnionFields;
+
+        let union_fields = UnionFields::try_new(
+            vec![0, 1],
+            vec![
+                Field::new("int_val", DataType::Int32, false),
+                Field::new("str_val", DataType::Utf8, true),
+            ],
+        )
+        .unwrap();
+        let union_type: Arc<dyn LogicalType> = Arc::new(NativeType::Union(
+            crate::types::LogicalUnionFields::from(&union_fields),
+        ));
+        assert_snapshot!(union_type, @r#"Union(0: ("int_val": non-null Int32), 1: ("str_val": String))"#);
+    }
+
+    #[test]
+    fn test_logical_type_display_nullable_vs_non_nullable() {
+        let nullable_list: Arc<dyn LogicalType> = Arc::new(NativeType::List(Arc::new(
+            LogicalField::from(&Field::new("item", DataType::Int32, true)),
+        )));
+        let non_nullable_list: Arc<dyn LogicalType> =
+            Arc::new(NativeType::List(Arc::new(LogicalField::from(&Field::new(
+                "item",
+                DataType::Int32,
+                false,
+            )))));
+
+        assert_snapshot!(nullable_list, @"List(Int32)");
+        assert_snapshot!(non_nullable_list, @"List(non-null Int32)");
+    }
+
+    #[test]
+    fn test_logical_type_display_extension() {
+        struct JsonType;
+        impl LogicalType for JsonType {
+            fn native(&self) -> &NativeType {
+                &NativeType::String
+            }
+            fn signature(&self) -> TypeSignature<'_> {
+                TypeSignature::Extension {
+                    name: "JSON",
+                    parameters: &[],
+                }
+            }
+        }
+        let json: Arc<dyn LogicalType> = Arc::new(JsonType);
+        assert_snapshot!(json, @"JSON");
+    }
+}
diff --git a/datafusion/common/src/types/mod.rs b/datafusion/common/src/types/mod.rs
index 2f9ce4ce02827..57bf921a6d564 100644
--- a/datafusion/common/src/types/mod.rs
+++ b/datafusion/common/src/types/mod.rs
@@ -16,11 +16,15 @@
// under the License. mod builtin; +mod canonical_extensions; +mod extension; mod field; mod logical; mod native; pub use builtin::*; +pub use canonical_extensions::*; +pub use extension::*; pub use field::*; pub use logical::*; pub use native::*; diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index 8c41701ae5768..580d572af4c0f 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -19,11 +19,11 @@ use super::{ LogicalField, LogicalFieldRef, LogicalFields, LogicalType, LogicalUnionFields, TypeSignature, }; -use crate::error::{Result, _internal_err}; +use crate::error::{_internal_err, Result}; use arrow::compute::can_cast_types; use arrow::datatypes::{ - DataType, Field, FieldRef, Fields, IntervalUnit, TimeUnit, UnionFields, - DECIMAL128_MAX_PRECISION, DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION, + DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION, DECIMAL128_MAX_PRECISION, DataType, + Field, FieldRef, Fields, IntervalUnit, TimeUnit, UnionFields, }; use std::{fmt::Display, sync::Arc}; @@ -184,9 +184,82 @@ pub enum NativeType { Map(LogicalFieldRef), } +/// Format a [`LogicalField`] for display, matching [`arrow::datatypes::DataType`]'s +/// Display convention of showing a `"non-null "` prefix for non-nullable fields. +fn format_logical_field( + f: &mut std::fmt::Formatter<'_>, + field: &LogicalField, +) -> std::fmt::Result { + let non_null = if field.nullable { "" } else { "non-null " }; + write!(f, "{:?}: {non_null}{}", field.name, field.logical_type) +} + impl Display for NativeType { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{self:?}") // TODO: nicer formatting + // Match the format used by arrow::datatypes::DataType's Display impl + match self { + Self::Null => write!(f, "Null"), + Self::Boolean => write!(f, "Boolean"), + Self::Int8 => write!(f, "Int8"), + Self::Int16 => write!(f, "Int16"), + Self::Int32 => write!(f, "Int32"), + Self::Int64 => write!(f, "Int64"), + Self::UInt8 => write!(f, "UInt8"), + Self::UInt16 => write!(f, "UInt16"), + Self::UInt32 => write!(f, "UInt32"), + Self::UInt64 => write!(f, "UInt64"), + Self::Float16 => write!(f, "Float16"), + Self::Float32 => write!(f, "Float32"), + Self::Float64 => write!(f, "Float64"), + Self::Timestamp(unit, Some(tz)) => write!(f, "Timestamp({unit}, {tz:?})"), + Self::Timestamp(unit, None) => write!(f, "Timestamp({unit})"), + Self::Date => write!(f, "Date"), + Self::Time(unit) => write!(f, "Time({unit})"), + Self::Duration(unit) => write!(f, "Duration({unit})"), + Self::Interval(unit) => write!(f, "Interval({unit:?})"), + Self::Binary => write!(f, "Binary"), + Self::FixedSizeBinary(size) => write!(f, "FixedSizeBinary({size})"), + Self::String => write!(f, "String"), + Self::List(field) => { + let non_null = if field.nullable { "" } else { "non-null " }; + write!(f, "List({non_null}{})", field.logical_type) + } + Self::FixedSizeList(field, size) => { + let non_null = if field.nullable { "" } else { "non-null " }; + write!( + f, + "FixedSizeList({size} x {non_null}{})", + field.logical_type + ) + } + Self::Struct(fields) => { + write!(f, "Struct(")?; + for (i, field) in fields.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + format_logical_field(f, field)?; + } + write!(f, ")") + } + Self::Union(fields) => { + write!(f, "Union(")?; + for (i, (type_id, field)) in fields.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{type_id}: (")?; + format_logical_field(f, field)?; + 
+                    write!(f, ")")?;
+                }
+                write!(f, ")")
+            }
+            Self::Decimal(precision, scale) => write!(f, "Decimal({precision}, {scale})"),
+            Self::Map(field) => {
+                let non_null = if field.nullable { "" } else { "non-null " };
+                write!(f, "Map({non_null}{})", field.logical_type)
+            }
+        }
     }
 }
 
@@ -241,9 +314,7 @@ impl LogicalType for NativeType {
             (Self::Decimal(p, s), _) => Decimal256(*p, *s),
             (Self::Timestamp(tu, tz), _) => Timestamp(*tu, tz.clone()),
             // If given type is Date, return the same type
-            (Self::Date, origin) if matches!(origin, Date32 | Date64) => {
-                origin.to_owned()
-            }
+            (Self::Date, Date32 | Date64) => origin.to_owned(),
             (Self::Date, _) => Date32,
             (Self::Time(tu), _) => match tu {
                 TimeUnit::Second | TimeUnit::Millisecond => Time32(*tu),
@@ -253,6 +324,8 @@ impl LogicalType for NativeType {
             (Self::Interval(iu), _) => Interval(*iu),
             (Self::Binary, LargeUtf8) => LargeBinary,
             (Self::Binary, Utf8View) => BinaryView,
+            // We don't cast to another kind of binary type if the original type is already binary
+            (Self::Binary, Binary | LargeBinary | BinaryView) => origin.to_owned(),
             (Self::Binary, data_type) if can_cast_types(data_type, &BinaryView) => {
                 BinaryView
             }
@@ -364,7 +437,7 @@ impl LogicalType for NativeType {
                     "Unavailable default cast for native type {} from physical type {}",
                     self,
                     origin
-                )
+                );
             }
         })
     }
@@ -430,22 +503,7 @@ impl From<DataType> for NativeType {
 impl NativeType {
     #[inline]
     pub fn is_numeric(&self) -> bool {
-        use NativeType::*;
-        matches!(
-            self,
-            UInt8
-                | UInt16
-                | UInt32
-                | UInt64
-                | Int8
-                | Int16
-                | Int32
-                | Int64
-                | Float16
-                | Float32
-                | Float64
-                | Decimal(_, _)
-        )
+        self.is_integer() || self.is_float() || self.is_decimal()
     }
 
     #[inline]
@@ -464,7 +522,7 @@ impl NativeType {
     #[inline]
     pub fn is_date(&self) -> bool {
-        matches!(self, NativeType::Date)
+        *self == NativeType::Date
     }
 
     #[inline]
@@ -489,6 +547,102 @@ impl NativeType {
     #[inline]
     pub fn is_null(&self) -> bool {
-        matches!(self, NativeType::Null)
+        *self == NativeType::Null
+    }
+
+    #[inline]
+    pub fn is_decimal(&self) -> bool {
+        matches!(self, Self::Decimal(_, _))
+    }
+
+    #[inline]
+    pub fn is_float(&self) -> bool {
+        matches!(self, Self::Float16 | Self::Float32 | Self::Float64)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::types::LogicalField;
+    use insta::assert_snapshot;
+
+    #[test]
+    fn test_native_type_display() {
+        assert_snapshot!(NativeType::Null, @"Null");
+        assert_snapshot!(NativeType::Boolean, @"Boolean");
+        assert_snapshot!(NativeType::Int8, @"Int8");
+        assert_snapshot!(NativeType::Int16, @"Int16");
+        assert_snapshot!(NativeType::Int32, @"Int32");
+        assert_snapshot!(NativeType::Int64, @"Int64");
+        assert_snapshot!(NativeType::UInt8, @"UInt8");
+        assert_snapshot!(NativeType::UInt16, @"UInt16");
+        assert_snapshot!(NativeType::UInt32, @"UInt32");
+        assert_snapshot!(NativeType::UInt64, @"UInt64");
+        assert_snapshot!(NativeType::Float16, @"Float16");
+        assert_snapshot!(NativeType::Float32, @"Float32");
+        assert_snapshot!(NativeType::Float64, @"Float64");
+        assert_snapshot!(NativeType::Date, @"Date");
+        assert_snapshot!(NativeType::Binary, @"Binary");
+        assert_snapshot!(NativeType::String, @"String");
+        assert_snapshot!(NativeType::FixedSizeBinary(16), @"FixedSizeBinary(16)");
+        assert_snapshot!(NativeType::Decimal(10, 2), @"Decimal(10, 2)");
+    }
+
+    #[test]
+    fn test_native_type_display_timestamp() {
+        assert_snapshot!(
+            NativeType::Timestamp(TimeUnit::Second, None),
+            @"Timestamp(s)"
+        );
+        assert_snapshot!(
+            NativeType::Timestamp(TimeUnit::Millisecond, None),
@"Timestamp(ms)" + ); + assert_snapshot!( + NativeType::Timestamp(TimeUnit::Nanosecond, Some(Arc::from("UTC"))), + @r#"Timestamp(ns, "UTC")"# + ); + } + + #[test] + fn test_native_type_display_time_duration_interval() { + assert_snapshot!(NativeType::Time(TimeUnit::Microsecond), @"Time(µs)"); + assert_snapshot!(NativeType::Duration(TimeUnit::Nanosecond), @"Duration(ns)"); + assert_snapshot!(NativeType::Interval(IntervalUnit::YearMonth), @"Interval(YearMonth)"); + assert_snapshot!(NativeType::Interval(IntervalUnit::MonthDayNano), @"Interval(MonthDayNano)"); + } + + #[test] + fn test_native_type_display_nested() { + let list = NativeType::List(Arc::new(LogicalField::from(&Field::new( + "item", + DataType::Int32, + true, + )))); + assert_snapshot!(list, @"List(Int32)"); + + let fixed_list = NativeType::FixedSizeList( + Arc::new(LogicalField::from(&Field::new( + "item", + DataType::Float64, + false, + ))), + 3, + ); + assert_snapshot!(fixed_list, @"FixedSizeList(3 x non-null Float64)"); + + let struct_type = NativeType::Struct(LogicalFields::from(&Fields::from(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("age", DataType::Int32, true), + ]))); + assert_snapshot!(struct_type, @r#"Struct("name": non-null String, "age": Int32)"#); + + let map = NativeType::Map(Arc::new(LogicalField::from(&Field::new( + "entries", + DataType::Utf8, + false, + )))); + assert_snapshot!(map, @"Map(non-null String)"); } } diff --git a/datafusion/common/src/utils/aggregate.rs b/datafusion/common/src/utils/aggregate.rs new file mode 100644 index 0000000000000..783ec665f3355 --- /dev/null +++ b/datafusion/common/src/utils/aggregate.rs @@ -0,0 +1,132 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Scalar-level aggregation utilities for statistics merging. +//! +//! Provides in-place accumulation helpers that reuse the existing +//! [`ScalarValue`] accumulator when possible. + +use crate::stats::Precision; +use crate::{Result, ScalarValue}; + +/// Adds `rhs` into `lhs`, mutating the accumulator in place when +/// possible and otherwise falling back to `ScalarValue::add_checked`. +pub(crate) fn scalar_add(lhs: &mut ScalarValue, rhs: &ScalarValue) -> Result<()> { + if lhs.try_add_checked_in_place(rhs)? { + return Ok(()); + } + + *lhs = lhs.add_checked(rhs)?; + Ok(()) +} + +/// [`Precision`]-aware sum that mutates `lhs` in place when possible. +/// +/// Mirrors the semantics of `Precision::add`, including +/// checked overflow handling, but avoids allocating a fresh +/// [`ScalarValue`] for the common numeric fast path. 
+pub(crate) fn precision_add(
+    lhs: &mut Precision<ScalarValue>,
+    rhs: &Precision<ScalarValue>,
+) {
+    let (mut lhs_value, lhs_is_exact) = match std::mem::take(lhs) {
+        Precision::Exact(value) => (value, true),
+        Precision::Inexact(value) => (value, false),
+        Precision::Absent => return,
+    };
+
+    let (rhs_value, rhs_is_exact) = match rhs {
+        Precision::Exact(value) => (value, true),
+        Precision::Inexact(value) => (value, false),
+        Precision::Absent => return,
+    };
+
+    if scalar_add(&mut lhs_value, rhs_value).is_err() {
+        return;
+    }
+
+    *lhs = if lhs_is_exact && rhs_is_exact {
+        Precision::Exact(lhs_value)
+    } else {
+        Precision::Inexact(lhs_value)
+    };
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_scalar_add_null_propagates() -> Result<()> {
+        let mut lhs = ScalarValue::Int32(Some(42));
+
+        scalar_add(&mut lhs, &ScalarValue::Int32(None))?;
+
+        assert_eq!(lhs, ScalarValue::Int32(None));
+        Ok(())
+    }
+
+    #[test]
+    fn test_scalar_add_overflow_returns_error() {
+        let mut lhs = ScalarValue::Int32(Some(i32::MAX));
+
+        let err = scalar_add(&mut lhs, &ScalarValue::Int32(Some(1)))
+            .unwrap_err()
+            .strip_backtrace();
+
+        assert_eq!(
+            err,
+            "Arrow error: Arithmetic overflow: Overflow happened on: 2147483647 + 1"
+        );
+    }
+
+    #[test]
+    fn test_precision_add_null_propagates() {
+        let mut lhs = Precision::Exact(ScalarValue::Int32(Some(42)));
+
+        precision_add(&mut lhs, &Precision::Exact(ScalarValue::Int32(None)));
+
+        assert_eq!(lhs, Precision::Exact(ScalarValue::Int32(None)));
+    }
+
+    #[test]
+    fn test_precision_add_overflow_becomes_absent() {
+        let mut lhs = Precision::Exact(ScalarValue::Int32(Some(i32::MAX)));
+
+        precision_add(&mut lhs, &Precision::Exact(ScalarValue::Int32(Some(1))));
+
+        assert_eq!(lhs, Precision::Absent);
+    }
+
+    #[test]
+    fn test_precision_add_rhs_absent_absorbs() {
+        let mut lhs = Precision::Exact(ScalarValue::Int32(Some(42)));
+
+        precision_add(&mut lhs, &Precision::Absent);
+
+        assert_eq!(lhs, Precision::Absent);
+    }
+
+    #[test]
+    fn test_precision_add_mixed_exactness() {
+        let mut lhs = Precision::Exact(ScalarValue::Int32(Some(10)));
+
+        precision_add(&mut lhs, &Precision::Inexact(ScalarValue::Int32(Some(5))));
+
+        assert_eq!(lhs, Precision::Inexact(ScalarValue::Int32(Some(15))));
+    }
+}
diff --git a/datafusion/common/src/utils/memory.rs b/datafusion/common/src/utils/memory.rs
index a56b940fab666..78ec434d2b577 100644
--- a/datafusion/common/src/utils/memory.rs
+++ b/datafusion/common/src/utils/memory.rs
@@ -18,8 +18,10 @@
 //! This module provides a function to estimate the memory size of a HashTable prior to allocation
 
 use crate::error::_exec_datafusion_err;
-use crate::Result;
-use std::mem::size_of;
+use crate::{HashSet, Result};
+use arrow::array::ArrayData;
+use arrow::record_batch::RecordBatch;
+use std::{mem::size_of, ptr::NonNull};
 
 /// Estimates the memory size required for a hash table prior to allocation.
 ///
@@ -99,6 +101,74 @@ pub fn estimate_memory_size<T>(num_elements: usize, fixed_size: usize) -> Result<usize>
     })
 }
 
+/// Calculate total used memory of this batch.
+///
+/// This function is used to estimate the physical memory usage of the `RecordBatch`.
+/// It only counts the memory of large data `Buffer`s, and ignores metadata like
+/// types and pointers.
+/// The implementation will add up each unique `Buffer`'s memory
+/// size, due to:
+/// - The data pointers inside `Buffer`s are memory regions returned by the global
+///   memory allocator; those regions can't overlap.
+/// - The actual used ranges of `ArrayRef`s inside the `RecordBatch` can overlap
+///   or reuse the same `Buffer`. For example: taking a slice from an `Array`.
+///
+/// Example:
+/// For a `RecordBatch` with two columns: `col1` and `col2`, two columns are pointing
+/// to a sub-region of the same buffer.
+///
+/// {xxxxxxxxxxxxxxxxxxx} <--- buffer
+///       ^    ^  ^    ^
+///       |    |  |    |
+/// col1->{    }  |    |
+/// col2--------->{    }
+///
+/// In the above case, `get_record_batch_memory_size` will return the size of
+/// the buffer, instead of the sum of `col1` and `col2`'s actual memory size.
+///
+/// Note: the current `RecordBatch::get_array_memory_size()` will double-count the
+/// buffer memory size if multiple arrays within the batch are sharing the same
+/// `Buffer`. This method provides a temporary fix until the issue is resolved:
+///
+pub fn get_record_batch_memory_size(batch: &RecordBatch) -> usize {
+    // Store pointers to each `Buffer`'s start memory address (instead of the actual
+    // used data region's pointer represented by the current `Array`)
+    let mut counted_buffers: HashSet<NonNull<u8>> = HashSet::new();
+    let mut total_size = 0;
+
+    for array in batch.columns() {
+        let array_data = array.to_data();
+        count_array_data_memory_size(&array_data, &mut counted_buffers, &mut total_size);
+    }
+
+    total_size
+}
+
+/// Count the memory usage of `array_data` and its children recursively.
+fn count_array_data_memory_size(
+    array_data: &ArrayData,
+    counted_buffers: &mut HashSet<NonNull<u8>>,
+    total_size: &mut usize,
+) {
+    // Count memory usage for `array_data`
+    for buffer in array_data.buffers() {
+        if counted_buffers.insert(buffer.data_ptr()) {
+            *total_size += buffer.capacity();
+        } // Otherwise the buffer's memory is already counted
+    }
+
+    if let Some(null_buffer) = array_data.nulls()
+        && counted_buffers.insert(null_buffer.inner().inner().data_ptr())
+    {
+        *total_size += null_buffer.inner().inner().capacity();
+    }
+
+    // Count all children `ArrayData` recursively
+    for child in array_data.child_data() {
+        count_array_data_memory_size(child, counted_buffers, total_size);
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use std::{collections::HashSet, mem::size_of};
@@ -132,3 +202,129 @@ mod tests {
         assert!(estimated.is_err());
     }
 }
+
+#[cfg(test)]
+mod record_batch_tests {
+    use super::*;
+    use arrow::array::{Float64Array, Int32Array, ListArray};
+    use arrow::datatypes::{DataType, Field, Int32Type, Schema};
+    use std::sync::Arc;
+
+    #[test]
+    fn test_get_record_batch_memory_size() {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("ints", DataType::Int32, true),
+            Field::new("float64", DataType::Float64, false),
+        ]));
+
+        let int_array =
+            Int32Array::from(vec![Some(1), Some(2), Some(3), Some(4), Some(5)]);
+        let float64_array = Float64Array::from(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
+
+        let batch = RecordBatch::try_new(
+            schema,
+            vec![Arc::new(int_array), Arc::new(float64_array)],
+        )
+        .unwrap();
+
+        let size = get_record_batch_memory_size(&batch);
+        assert_eq!(size, 60);
+    }
+
+    #[test]
+    fn test_get_record_batch_memory_size_with_null() {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("ints", DataType::Int32, true),
+            Field::new("float64", DataType::Float64, false),
+        ]));
+
+        let int_array = Int32Array::from(vec![None, Some(2), Some(3)]);
+        let float64_array = Float64Array::from(vec![1.0, 2.0, 3.0]);
+
+        let batch = RecordBatch::try_new(
+            schema,
+            vec![Arc::new(int_array), Arc::new(float64_array)],
+        )
+        .unwrap();
+
+        let size = get_record_batch_memory_size(&batch);
+        assert_eq!(size, 100);
+    }
+
+    #[test]
+    fn test_get_record_batch_memory_size_empty() {
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "ints",
+            DataType::Int32,
+            false,
+        )]));
+
+        let int_array: Int32Array = Int32Array::from(vec![] as Vec<i32>);
+        let batch = RecordBatch::try_new(schema, vec![Arc::new(int_array)]).unwrap();
+
+        let size = get_record_batch_memory_size(&batch);
+        assert_eq!(size, 0, "Empty batch should have 0 memory size");
+    }
+
+    #[test]
+    fn test_get_record_batch_memory_size_shared_buffer() {
+        let original = Int32Array::from(vec![1, 2, 3, 4, 5]);
+        let slice1 = original.slice(0, 3);
+        let slice2 = original.slice(2, 3);
+
+        let schema_origin = Arc::new(Schema::new(vec![Field::new(
+            "origin_col",
+            DataType::Int32,
+            false,
+        )]));
+        let batch_origin =
+            RecordBatch::try_new(schema_origin, vec![Arc::new(original)]).unwrap();
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("slice1", DataType::Int32, false),
+            Field::new("slice2", DataType::Int32, false),
+        ]));
+
+        let batch_sliced =
+            RecordBatch::try_new(schema, vec![Arc::new(slice1), Arc::new(slice2)])
+                .unwrap();
+
+        let size_origin = get_record_batch_memory_size(&batch_origin);
+        let size_sliced = get_record_batch_memory_size(&batch_sliced);
+
+        assert_eq!(size_origin, size_sliced);
+    }
+
+    #[test]
+    fn test_get_record_batch_memory_size_nested_array() {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new(
+                "nested_int",
+                DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))),
+                false,
+            ),
+            Field::new(
+                "nested_int2",
+                DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))),
+                false,
+            ),
+        ]));
+
+        let int_list_array = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+            Some(vec![Some(1), Some(2), Some(3)]),
+        ]);
+
+        let int_list_array2 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+            Some(vec![Some(4), Some(5), Some(6)]),
+        ]);
+
+        let batch = RecordBatch::try_new(
+            schema,
+            vec![Arc::new(int_list_array), Arc::new(int_list_array2)],
+        )
+        .unwrap();
+
+        let size = get_record_batch_memory_size(&batch);
+        assert_eq!(size, 8208);
+    }
+}
diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs
index 7b145ac3ae21d..8c88be03fd5c8 100644
--- a/datafusion/common/src/utils/mod.rs
+++ b/datafusion/common/src/utils/mod.rs
@@ -17,28 +17,36 @@
 
 //! This module provides the bisect function, which implements binary search.
+pub(crate) mod aggregate;
 pub mod expr;
 pub mod memory;
 pub mod proxy;
 pub mod string_utils;
 
-use crate::error::{_exec_datafusion_err, _internal_datafusion_err, _internal_err};
+use crate::assert_or_internal_err;
+use crate::error::{_exec_datafusion_err, _exec_err, _internal_datafusion_err};
 use crate::{Result, ScalarValue};
 use arrow::array::{
-    cast::AsArray, Array, ArrayRef, FixedSizeListArray, LargeListArray, ListArray,
-    OffsetSizeTrait,
+    Array, ArrayRef, FixedSizeListArray, LargeListArray, ListArray, OffsetSizeTrait,
+    cast::AsArray,
 };
-use arrow::buffer::OffsetBuffer;
-use arrow::compute::{partition, SortColumn, SortOptions};
+use arrow::array::{
+    Datum, GenericListArray, Int32Array, Int64Array, MutableArrayData, make_array,
+};
+use arrow::array::{LargeListViewArray, ListViewArray};
+use arrow::buffer::{OffsetBuffer, ScalarBuffer};
+use arrow::compute::kernels::cmp::neq;
+use arrow::compute::kernels::length::length;
+use arrow::compute::{SortColumn, SortOptions, partition};
 use arrow::datatypes::{DataType, Field, SchemaRef};
 #[cfg(feature = "sql")]
 use sqlparser::{ast::Ident, dialect::GenericDialect, parser::Parser};
 use std::borrow::{Borrow, Cow};
-use std::cmp::{min, Ordering};
+use std::cmp::{Ordering, min};
 use std::collections::HashSet;
 use std::num::NonZero;
 use std::ops::Range;
-use std::sync::Arc;
+use std::sync::{Arc, LazyLock};
 use std::thread::available_parallelism;
 
 /// Applies an optional projection to a [`SchemaRef`], returning the
@@ -69,10 +77,10 @@ use std::thread::available_parallelism;
 /// ```
 pub fn project_schema(
     schema: &SchemaRef,
-    projection: Option<&Vec<usize>>,
+    projection: Option<&impl AsRef<[usize]>>,
 ) -> Result<SchemaRef> {
     let schema = match projection {
-        Some(columns) => Arc::new(schema.project(columns)?),
+        Some(columns) => Arc::new(schema.project(columns.as_ref())?),
         None => Arc::clone(schema),
     };
     Ok(schema)
@@ -265,10 +273,10 @@ fn needs_quotes(s: &str) -> bool {
     let mut chars = s.chars();
 
     // first char can not be a number unless escaped
-    if let Some(first_char) = chars.next() {
-        if !(first_char.is_ascii_lowercase() || first_char == '_') {
-            return true;
-        }
+    if let Some(first_char) = chars.next()
+        && !(first_char.is_ascii_lowercase() || first_char == '_')
+    {
+        return true;
     }
 
     !chars.all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '_')
@@ -478,6 +486,34 @@ impl SingleRowListArrayBuilder {
         ScalarValue::FixedSizeList(Arc::new(self.build_fixed_size_list_array(list_size)))
     }
 
+    /// Build a single element [`ListViewArray`]
+    pub fn build_list_view_array(self) -> ListViewArray {
+        let (field, arr) = self.into_field_and_arr();
+        let offsets = ScalarBuffer::from(vec![0]);
+        let sizes = ScalarBuffer::from(vec![i32::try_from(arr.len()).expect(
+            "Trying to construct a ListView where element length exceeds i32::MAX",
+        )]);
+        ListViewArray::new(field, offsets, sizes, arr, None)
+    }
+
+    /// Build a single element [`ListViewArray`] and wrap as [`ScalarValue::ListView`]
+    pub fn build_list_view_scalar(self) -> ScalarValue {
+        ScalarValue::ListView(Arc::new(self.build_list_view_array()))
+    }
+
+    /// Build a single element [`LargeListViewArray`]
+    pub fn build_large_list_view_array(self) -> LargeListViewArray {
+        let (field, arr) = self.into_field_and_arr();
+        let offsets = ScalarBuffer::from(vec![0]);
+        let sizes = ScalarBuffer::from(vec![arr.len() as i64]);
+        LargeListViewArray::new(field, offsets, sizes, arr, None)
+    }
+
+    /// Build a single element [`LargeListViewArray`] and wrap as [`ScalarValue::LargeListView`]
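+    ///
+    /// A minimal usage sketch (it assumes `values` is an `ArrayRef` already in scope):
+    ///
+    /// ```ignore
+    /// let scalar = SingleRowListArrayBuilder::new(values).build_large_list_view_scalar();
+    /// assert!(matches!(scalar, ScalarValue::LargeListView(_)));
+    /// ```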
+    pub fn build_large_list_view_scalar(self) -> ScalarValue {
+        ScalarValue::LargeListView(Arc::new(self.build_large_list_view_array()))
+    }
+
     /// Helper function: convert this builder into a tuple of field and array
     fn into_field_and_arr(self) -> (Arc<Field>, ArrayRef) {
         let Self {
@@ -515,13 +551,12 @@ impl SingleRowListArrayBuilder {
 /// );
 ///
 /// assert_eq!(list_arr, expected);
+/// ```
 pub fn arrays_into_list_array(
     arr: impl IntoIterator<Item = ArrayRef>,
 ) -> Result<ListArray> {
     let arr = arr.into_iter().collect::<Vec<_>>();
-    if arr.is_empty() {
-        return _internal_err!("Cannot wrap empty array into list array");
-    }
+    assert_or_internal_err!(!arr.is_empty(), "Cannot wrap empty array into list array");
 
     let lens = arr.iter().map(|x| x.len()).collect::<Vec<_>>();
     // Assume data type is consistent
@@ -564,11 +599,17 @@ pub fn base_type(data_type: &DataType) -> DataType {
     match data_type {
         DataType::List(field)
         | DataType::LargeList(field)
+        | DataType::ListView(field)
+        | DataType::LargeListView(field)
        | DataType::FixedSizeList(field, _) => base_type(field.data_type()),
        _ => data_type.to_owned(),
    }
 }
 
+// TODO: Modify this to also allow specifying how listviews should be treated.
+// For example if cast to List (default) or maintain as ListView (requires
+// function to implement support for ListViews)
+// https://github.com/apache/datafusion/issues/21777
 /// Information about how to coerce lists.
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)]
 pub enum ListCoercion {
@@ -588,6 +629,7 @@ pub enum ListCoercion {
 /// let base_type = DataType::Float64;
 /// let coerced_type = coerced_type_with_base_type_only(&data_type, &base_type, None);
 /// assert_eq!(coerced_type, DataType::List(Arc::new(Field::new_list_field(DataType::Float64, true))));
+/// ```
 pub fn coerced_type_with_base_type_only(
     data_type: &DataType,
     base_type: &DataType,
@@ -621,6 +663,19 @@ pub fn coerced_type_with_base_type_only(
                 *len,
             )
         }
+        (DataType::ListView(field), _) => {
+            let field_type = coerced_type_with_base_type_only(
+                field.data_type(),
+                base_type,
+                array_coercion,
+            );
+
+            DataType::ListView(Arc::new(Field::new(
+                field.name(),
+                field_type,
+                field.is_nullable(),
+            )))
+        }
        (DataType::LargeList(field), _) => {
            let field_type = coerced_type_with_base_type_only(
                field.data_type(),
@@ -634,6 +689,19 @@ pub fn coerced_type_with_base_type_only(
                field.is_nullable(),
            )))
        }
+        (DataType::LargeListView(field), _) => {
+            let field_type = coerced_type_with_base_type_only(
+                field.data_type(),
+                base_type,
+                array_coercion,
+            );
+
+            DataType::LargeListView(Arc::new(Field::new(
+                field.name(),
+                field_type,
+                field.is_nullable(),
+            )))
+        }
 
         _ => base_type.clone(),
     }
@@ -651,6 +719,15 @@ pub fn coerced_fixed_size_list_to_list(data_type: &DataType) -> DataType {
                field.is_nullable(),
            )))
        }
+        DataType::ListView(field) => {
+            let field_type = coerced_fixed_size_list_to_list(field.data_type());
+
+            DataType::ListView(Arc::new(Field::new(
+                field.name(),
+                field_type,
+                field.is_nullable(),
+            )))
+        }
        DataType::LargeList(field) => {
            let field_type = coerced_fixed_size_list_to_list(field.data_type());
 
@@ -660,6 +737,15 @@ pub fn coerced_fixed_size_list_to_list(data_type: &DataType) -> DataType {
                field.is_nullable(),
            )))
        }
+        DataType::LargeListView(field) => {
+            let field_type = coerced_fixed_size_list_to_list(field.data_type());
+
+            DataType::LargeListView(Arc::new(Field::new(
+                field.name(),
+                field_type,
+                field.is_nullable(),
+            )))
+        }
 
        _ => data_type.clone(),
    }
@@ -670,6 +756,8 @@ pub fn list_ndims(data_type: &DataType) -> u64 {
     match data_type {
         DataType::List(field)
         | DataType::LargeList(field)
+        | DataType::ListView(field)
+        | DataType::LargeListView(field)
         | DataType::FixedSizeList(field, _) => 1 + list_ndims(field.data_type()),
         _ => 0,
     }
@@ -694,10 +782,14 @@ pub mod datafusion_strsim {
     }
 
     /// Calculates the minimum number of insertions, deletions, and substitutions
-    /// required to change one sequence into the other.
-    fn generic_levenshtein<'a, 'b, Iter1, Iter2, Elem1, Elem2>(
+    /// required to change one sequence into the other, using a reusable cache buffer.
+    ///
+    /// This is the generic implementation that works with any iterator types.
+    /// The `cache` buffer will be resized as needed and reused across calls.
+    fn generic_levenshtein_with_buffer<'a, 'b, Iter1, Iter2, Elem1, Elem2>(
         a: &'a Iter1,
         b: &'b Iter2,
+        cache: &mut Vec<usize>,
     ) -> usize
     where
         &'a Iter1: IntoIterator<Item = Elem1>,
         &'b Iter2: IntoIterator<Item = Elem2>,
         Elem1: PartialEq<Elem2>,
     {
@@ -710,7 +802,9 @@
             return b_len;
         }
 
-        let mut cache: Vec<usize> = (1..b_len + 1).collect();
+        // Resize cache to fit b_len elements
+        cache.clear();
+        cache.extend(1..=b_len);
 
         let mut result = 0;
 
@@ -730,6 +824,21 @@
         result
     }
 
+    /// Calculates the minimum number of insertions, deletions, and substitutions
+    /// required to change one sequence into the other.
+    fn generic_levenshtein<'a, 'b, Iter1, Iter2, Elem1, Elem2>(
+        a: &'a Iter1,
+        b: &'b Iter2,
+    ) -> usize
+    where
+        &'a Iter1: IntoIterator<Item = Elem1>,
+        &'b Iter2: IntoIterator<Item = Elem2>,
+        Elem1: PartialEq<Elem2>,
+    {
+        let mut cache = Vec::new();
+        generic_levenshtein_with_buffer(a, b, &mut cache)
+    }
+
     /// Calculates the minimum number of insertions, deletions, and substitutions
     /// required to change one string into the other.
     ///
@@ -742,6 +851,15 @@
         generic_levenshtein(&StringWrapper(a), &StringWrapper(b))
     }
 
+    /// Calculates the Levenshtein distance using a reusable cache buffer.
+    /// This avoids allocating a new Vec for each call, improving performance
+    /// when computing many distances.
+    ///
+    /// The `cache` buffer will be resized as needed and reused across calls.
+    pub fn levenshtein_with_buffer(a: &str, b: &str, cache: &mut Vec<usize>) -> usize {
+        generic_levenshtein_with_buffer(&StringWrapper(a), &StringWrapper(b), cache)
+    }
+
     /// Calculates the normalized Levenshtein distance between two strings.
     /// The normalized distance is a value between 0.0 and 1.0, where 1.0 indicates
     /// that the strings are identical and 0.0 indicates no similarity.
@@ -891,10 +1009,15 @@ pub fn combine_limit(
 ///
 /// This is a wrapper around `std::thread::available_parallelism`, providing a default value
 /// of `1` if the system's parallelism cannot be determined.
+///
+/// The result is cached after the first call.
 pub fn get_available_parallelism() -> usize {
-    available_parallelism()
-        .unwrap_or(NonZero::new(1).expect("literal value `1` shouldn't be zero"))
-        .get()
+    static PARALLELISM: LazyLock<usize> = LazyLock::new(|| {
+        available_parallelism()
+            .unwrap_or(NonZero::new(1).expect("literal value `1` shouldn't be zero"))
+            .get()
+    });
+    *PARALLELISM
 }
 
 /// Converts a collection of function arguments into a fixed-size array of length N
@@ -939,13 +1062,137 @@ pub fn take_function_args(
     })
 }
 
+/// Returns the inner values of a list, or an error otherwise.
+/// For [`ListArray`] and [`LargeListArray`], if it's sliced, it returns a
+/// sliced array too.
+/// Therefore, to reconstruct a list using it,
+/// you must adjust the offsets using [`adjust_offsets_for_slice`].
+pub fn list_values(array: &dyn Array) -> Result<ArrayRef> {
+    match array.data_type() {
+        DataType::List(_) => Ok(sliced_list_values(array.as_list::<i32>())),
+        DataType::LargeList(_) => Ok(sliced_list_values(array.as_list::<i64>())),
+        DataType::FixedSizeList(_, _) => {
+            Ok(Arc::clone(array.as_fixed_size_list().values()))
+        }
+        other => _exec_err!("expected list, got {other}"),
+    }
+}
+
+fn sliced_list_values<O: OffsetSizeTrait>(list: &GenericListArray<O>) -> ArrayRef {
+    let values = list.values();
+    let offsets = list.offsets();
+
+    if let (Some(first), Some(last)) = (offsets.first(), offsets.last()) {
+        let first = first.as_usize();
+        let last = last.as_usize();
+
+        if first != 0 || last != values.len() {
+            return values.slice(first, last - first);
+        }
+    }
+
+    Arc::clone(values)
+}
+
+/// If `list` is sliced, returns an adjusted offset buffer so that
+/// it points to the sliced portion of the list values, and not the whole list values.
+pub fn adjust_offsets_for_slice<O: OffsetSizeTrait>(
+    list: &GenericListArray<O>,
+) -> OffsetBuffer<O> {
+    let offsets = list.offsets();
+
+    if let (Some(first), Some(last)) = (offsets.first(), offsets.last())
+        && (!first.is_zero() || last.as_usize() != list.values().len())
+    {
+        let offsets = offsets.iter().map(|offset| *offset - *first).collect();
+
+        // TODO: use unsafe OffsetBuffer::new_unchecked?
+        return OffsetBuffer::new(offsets);
+    }
+
+    offsets.clone()
+}
+
+/// For lists and large lists, truncates the sublists of null entries.
+/// Otherwise returns an error.
+pub fn remove_list_null_values(array: &ArrayRef) -> Result<ArrayRef> {
+    // todo: handle list view and map
+    match array.data_type() {
+        DataType::List(_) => Ok(Arc::new(truncate_list_nulls(array.as_list::<i32>())?)),
+        DataType::LargeList(_) => {
+            Ok(Arc::new(truncate_list_nulls(array.as_list::<i64>())?))
+        }
+        dt => _exec_err!("expected List or LargeList, got {dt}"),
+    }
+}
+
+fn truncate_list_nulls<O: OffsetSizeTrait>(
+    list: &GenericListArray<O>,
+) -> Result<GenericListArray<O>> {
+    if let Some(nulls) = list.nulls()
+        && nulls.null_count() > 0
+    {
+        let lengths = length(list)?;
+        let zero: &dyn Datum = if lengths.data_type() == &DataType::Int32 {
+            &Int32Array::new_scalar(0)
+        } else {
+            &Int64Array::new_scalar(0)
+        };
+
+        let not_empty = neq(&lengths, zero)?;
+        let null_and_non_empty = &!nulls.inner() & not_empty.values();
+
+        if null_and_non_empty.count_set_bits() > 0 {
+            let array_data = list.values().to_data();
+            let offsets = list.offsets();
+            let capacity = offsets[offsets.len() - 1] - offsets[0];
+            let mut mutable_array_data =
+                MutableArrayData::new(vec![&array_data], false, capacity.as_usize());
+
+            let valid_or_empty = nulls.inner() | &!not_empty.values();
+
+            for (start, end) in valid_or_empty.set_slices() {
+                mutable_array_data.extend(
+                    0,
+                    offsets[start].as_usize(),
+                    offsets[end].as_usize(),
+                );
+            }
+
+            let lengths = std::iter::zip(offsets.lengths(), nulls)
+                .map(|(length, is_valid)| if is_valid { length } else { 0 });
+
+            let offsets = OffsetBuffer::from_lengths(lengths);
+            let values = make_array(mutable_array_data.freeze());
+
+            let field = match list.data_type() {
+                DataType::List(field) => field,
+                DataType::LargeList(field) => field,
+                _ => unreachable!(),
+            };
+
+            return Ok(GenericListArray::try_new(
+                Arc::clone(field),
+                offsets,
+                values,
+                list.nulls().cloned(),
+            )?);
+        }
+    }
+    Ok(list.clone())
+}
+
 #[cfg(test)]
 mod tests {
+    use std::sync::Arc;
+
     use super::*;
     use crate::ScalarValue::Null;
-    use arrow::array::Float64Array;
+    use arrow::{
+        array::{Float64Array, Int32Array},
+        buffer::NullBuffer,
+        datatypes::Int32Type,
+    };
     use sqlparser::ast::Ident;
-    use sqlparser::tokenizer::Span;
 
     #[test]
     fn test_bisect_linear_left_and_right() -> Result<()> {
@@ -1174,7 +1421,7 @@ mod tests {
         let expected_parsed = vec![Ident {
             value: identifier.to_string(),
             quote_style,
-            span: Span::empty(),
+            span: sqlparser::tokenizer::Span::empty(),
         }];
 
         assert_eq!(
@@ -1245,4 +1492,129 @@ mod tests {
         assert_eq!(expected, transposed);
         Ok(())
     }
+
+    #[test]
+    fn test_sliced_list_values() {
+        let data = vec![
+            Some(vec![Some(0), Some(1), Some(2)]),
+            None,
+            Some(vec![Some(3), None, Some(5)]),
+            Some(vec![Some(6), Some(7)]),
+        ];
+
+        let list = ListArray::from_iter_primitive::<Int32Type, _, _>(data);
+
+        assert_eq!(
+            sliced_list_values(&list).as_primitive(),
+            &Int32Array::from(vec![
+                Some(0),
+                Some(1),
+                Some(2),
+                Some(3),
+                None,
+                Some(5),
+                Some(6),
+                Some(7)
+            ])
+        );
+
+        assert_eq!(
+            sliced_list_values(&list.slice(0, 1)).as_primitive(),
+            &Int32Array::from(vec![Some(0), Some(1), Some(2)])
+        );
+
+        assert_eq!(
+            sliced_list_values(&list.slice(2, 1)).as_primitive(),
+            &Int32Array::from(vec![Some(3), None, Some(5)])
+        );
+
+        assert_eq!(
+            sliced_list_values(&list.slice(3, 1)).as_primitive(),
+            &Int32Array::from(vec![Some(6), Some(7)])
+        );
+
+        assert!(sliced_list_values(&list.slice(0, 0)).is_empty());
+        assert!(sliced_list_values(&list.slice(1, 0)).is_empty());
+        assert!(sliced_list_values(&list.slice(3, 0)).is_empty());
+    }
+
+    #[test]
+    fn test_adjust_offsets() {
+        let data = vec![
+            Some(vec![Some(0), Some(1), Some(2)]),
+            None,
+            Some(vec![Some(3), None, Some(5)]),
+            Some(vec![Some(6), Some(7)]),
+        ];
+        let list = ListArray::from_iter_primitive::<Int32Type, _, _>(data);
+
+        assert_eq!(
+            adjust_offsets_for_slice(&list),
+            OffsetBuffer::from_lengths([3, 0, 3, 2])
+        );
+
+        assert_eq!(
+            adjust_offsets_for_slice(&list.slice(0, 1)),
+            OffsetBuffer::from_lengths([3])
+        );
+
+        assert_eq!(
+            adjust_offsets_for_slice(&list.slice(1, 2)),
+            OffsetBuffer::from_lengths([0, 3])
+        );
+
+        assert_eq!(
+            adjust_offsets_for_slice(&list.slice(1, 3)),
+            OffsetBuffer::from_lengths([0, 3, 2])
+        );
+
+        assert_eq!(
+            adjust_offsets_for_slice(&list.slice(0, 0)),
+            OffsetBuffer::from_lengths([])
+        );
+
+        assert_eq!(
+            adjust_offsets_for_slice(&list.slice(1, 0)),
+            OffsetBuffer::from_lengths([])
+        );
+
+        assert_eq!(
+            adjust_offsets_for_slice(&list.slice(3, 0)),
+            OffsetBuffer::from_lengths([])
+        );
+    }
+
+    fn create_i32_list(
+        values: impl Into<Int32Array>,
+        offsets: OffsetBuffer<i32>,
+        nulls: Option<NullBuffer>,
+    ) -> ListArray {
+        let list_field = Arc::new(Field::new_list_field(DataType::Int32, true));
+
+        ListArray::new(list_field, offsets, Arc::new(values.into()), nulls)
+    }
+
+    #[test]
+    fn test_remove_list_null_values_list() {
+        let list = Arc::new(create_i32_list(
+            vec![100, 20, 10, 0, 0, 0, 0, 1, 50],
+            OffsetBuffer::<i32>::from_lengths(vec![3, 4, 0, 2, 0]),
+            Some(NullBuffer::from(vec![true, false, false, true, false])),
+        )) as ArrayRef;
+
+        let res = remove_list_null_values(&list).unwrap();
+        let res = res.as_list::<i32>();
+
+        let expected = Arc::new(create_i32_list(
+            vec![100, 20, 10, 1, 50],
+            OffsetBuffer::<i32>::from_lengths(vec![3, 0, 0, 2, 0]),
+            Some(NullBuffer::from(vec![true, false, false, true, false])),
+        )) as ArrayRef;
+        let expected = expected.as_list::<i32>();
+
+        assert_eq!(res, expected);
+        // check above skips inner value of nulls
+        assert_eq!(res.values(), expected.values());
+        assert_eq!(res.offsets(), expected.offsets());
+    }
+}
diff --git a/datafusion/common/src/utils/proxy.rs b/datafusion/common/src/utils/proxy.rs
index fb951aa3b0289..846c928515d60 100644
--- a/datafusion/common/src/utils/proxy.rs
+++ b/datafusion/common/src/utils/proxy.rs
@@ -15,12 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! [`VecAllocExt`] and [`RawTableAllocExt`] to help tracking of memory allocations
+//! [`VecAllocExt`] to help tracking of memory allocations
 
-use hashbrown::{
-    hash_table::HashTable,
-    raw::{Bucket, RawTable},
-};
+use hashbrown::hash_table::HashTable;
 use std::mem::size_of;
 
 /// Extension trait for [`Vec`] to account for allocations.
@@ -114,75 +111,6 @@ impl VecAllocExt for Vec {
     }
 }
 
-/// Extension trait for hash browns [`RawTable`] to account for allocations.
-pub trait RawTableAllocExt {
-    /// Item type.
-    type T;
-
-    /// [Insert](RawTable::insert) new element into table and increase
-    /// `accounting` by any newly allocated bytes.
-    ///
-    /// Returns the bucket where the element was inserted.
-    /// Note that allocation counts capacity, not size.
-    ///
-    /// # Example:
-    /// ```
-    /// # use datafusion_common::utils::proxy::RawTableAllocExt;
-    /// # use hashbrown::raw::RawTable;
-    /// let mut table = RawTable::new();
-    /// let mut allocated = 0;
-    /// let hash_fn = |x: &u32| (*x as u64) % 1000;
-    /// // pretend 0x3117 is the hash value for 1
-    /// table.insert_accounted(1, hash_fn, &mut allocated);
-    /// assert_eq!(allocated, 64);
-    ///
-    /// // insert more values
-    /// for i in 0..100 {
-    ///     table.insert_accounted(i, hash_fn, &mut allocated);
-    /// }
-    /// assert_eq!(allocated, 400);
-    /// ```
-    fn insert_accounted(
-        &mut self,
-        x: Self::T,
-        hasher: impl Fn(&Self::T) -> u64,
-        accounting: &mut usize,
-    ) -> Bucket<Self::T>;
-}
-
-impl<T> RawTableAllocExt for RawTable<T> {
-    type T = T;
-
-    fn insert_accounted(
-        &mut self,
-        x: Self::T,
-        hasher: impl Fn(&Self::T) -> u64,
-        accounting: &mut usize,
-    ) -> Bucket<T> {
-        let hash = hasher(&x);
-
-        match self.try_insert_no_grow(hash, x) {
-            Ok(bucket) => bucket,
-            Err(x) => {
-                // need to request more memory
-
-                let bump_elements = self.capacity().max(16);
-                let bump_size = bump_elements * size_of::<T>();
-                *accounting = (*accounting).checked_add(bump_size).expect("overflow");
-
-                self.reserve(bump_elements, hasher);
-
-                // still need to insert the element since first try failed
-                // Note: cannot use `.expect` here because `T` may not implement `Debug`
-                match self.try_insert_no_grow(hash, x) {
-                    Ok(bucket) => bucket,
-                    Err(_) => panic!("just grew the container"),
-                }
-            }
-        }
-    }
-}
-
 /// Extension trait for hash browns [`HashTable`] to account for allocations.
 pub trait HashTableAllocExt {
     /// Item type.
@@ -193,6 +121,8 @@ pub trait HashTableAllocExt {
     ///
     /// Returns the bucket where the element was inserted.
     /// Note that allocation counts capacity, not size.
+    /// # Panics:
+    /// Assumes the element is not already present, and may panic if it is.
     ///
     /// # Example:
     /// ```
@@ -206,7 +136,7 @@ pub trait HashTableAllocExt {
     /// assert_eq!(allocated, 64);
     ///
     /// // insert more values
-    /// for i in 0..100 {
+    /// for i in 2..100 {
    ///     table.insert_accounted(i, hash_fn, &mut allocated);
    /// }
    /// assert_eq!(allocated, 400);
@@ -233,22 +163,24 @@ impl<T> HashTableAllocExt for HashTable<T>
 where
     T: Eq,
 {
     fn insert_accounted(
         &mut self,
         x: Self::T,
         hasher: impl Fn(&Self::T) -> u64,
         accounting: &mut usize,
     ) {
         let hash = hasher(&x);
 
-        // NOTE: `find_entry` does NOT grow!
-        match self.find_entry(hash, |y| y == &x) {
-            Ok(_occupied) => {}
-            Err(_absent) => {
-                if self.len() == self.capacity() {
-                    // need to request more memory
-                    let bump_elements = self.capacity().max(16);
-                    let bump_size = bump_elements * size_of::<T>();
-                    *accounting = (*accounting).checked_add(bump_size).expect("overflow");
+        if cfg!(debug_assertions) {
+            // In debug mode, check that the element is not already present
+            debug_assert!(
+                self.find_entry(hash, |y| y == &x).is_err(),
+                "attempted to insert duplicate element into HashTableAllocExt::insert_accounted"
+            );
+        }
 
-                    self.reserve(bump_elements, &hasher);
-                }
+        if self.len() == self.capacity() {
+            // need to request more memory
+            let bump_elements = self.capacity().max(16);
+            let bump_size = bump_elements * size_of::<T>();
+            *accounting = (*accounting).checked_add(bump_size).expect("overflow");
 
-                // still need to insert the element since first try failed
-                self.entry(hash, |y| y == &x, hasher).insert(x);
-            }
+            self.reserve(bump_elements, &hasher);
         }
+
+        // We assume the element is not already present
+        self.insert_unique(hash, x, hasher);
     }
 }
diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml
index f672e3a946816..ebd05392c926d 100644
--- a/datafusion/core/Cargo.toml
+++ b/datafusion/core/Cargo.toml
@@ -19,7 +19,7 @@
 name = "datafusion"
 description = "DataFusion is an in-memory query engine that uses Apache Arrow as the memory model"
 keywords = ["arrow", "query", "sql"]
-include = ["benches/*.rs", "src/**/*.rs", "Cargo.toml", "LICENSE.txt", "NOTICE.txt"]
+include = ["benches/*.rs", "src/**/*.md", "src/**/*.rs", "Cargo.toml", "LICENSE.txt", "NOTICE.txt"]
 readme = "../../README.md"
 version = { workspace = true }
 edition = { workspace = true }
@@ -32,6 +32,9 @@ rust-version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
@@ -40,10 +43,10 @@ nested_expressions = ["datafusion-functions-nested"]
 # This feature is deprecated. Use the `nested_expressions` feature instead.
array_expressions = ["nested_expressions"] # Used to enable the avro format -avro = ["datafusion-common/avro", "datafusion-datasource-avro"] +avro = ["datafusion-datasource-avro"] backtrace = ["datafusion-common/backtrace"] compression = [ - "xz2", + "liblzma", "bzip2", "flate2", "zstd", @@ -76,7 +79,6 @@ parquet_encryption = [ "datafusion-common/parquet_encryption", "datafusion-datasource-parquet/parquet_encryption", ] -pyarrow = ["datafusion-common/pyarrow", "parquet"] regex_expressions = [ "datafusion-functions/regex_expressions", ] @@ -85,8 +87,9 @@ recursive_protection = [ "datafusion-expr/recursive_protection", "datafusion-optimizer/recursive_protection", "datafusion-physical-optimizer/recursive_protection", - "datafusion-sql/recursive_protection", - "sqlparser/recursive-protection", + "datafusion-physical-expr/recursive_protection", + "datafusion-sql?/recursive_protection", + "sqlparser?/recursive-protection", ] serde = [ "dep:serde", @@ -111,8 +114,7 @@ extended_tests = [] arrow = { workspace = true } arrow-schema = { workspace = true, features = ["canonical_extension_types"] } async-trait = { workspace = true } -bytes = { workspace = true } -bzip2 = { version = "0.6.1", optional = true } +bzip2 = { workspace = true, optional = true } chrono = { workspace = true } datafusion-catalog = { workspace = true } datafusion-catalog-listing = { workspace = true } @@ -140,24 +142,22 @@ datafusion-physical-optimizer = { workspace = true } datafusion-physical-plan = { workspace = true } datafusion-session = { workspace = true } datafusion-sql = { workspace = true, optional = true } -flate2 = { version = "1.1.4", optional = true } +flate2 = { workspace = true, optional = true } futures = { workspace = true } +indexmap = { workspace = true } itertools = { workspace = true } +liblzma = { workspace = true, optional = true } log = { workspace = true } object_store = { workspace = true } parking_lot = { workspace = true } parquet = { workspace = true, optional = true, default-features = true } -rand = { workspace = true } -regex = { workspace = true } -rstest = { workspace = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } sqlparser = { workspace = true, optional = true } tempfile = { workspace = true } tokio = { workspace = true } url = { workspace = true } -uuid = { version = "1.18", features = ["v4", "js"] } -xz2 = { version = "0.1", optional = true, features = ["static"] } -zstd = { version = "0.13", optional = true, default-features = false } +uuid = { workspace = true, features = ["v4", "js"] } +zstd = { workspace = true, optional = true } [dev-dependencies] async-trait = { workspace = true } @@ -169,16 +169,17 @@ datafusion-functions-window-common = { workspace = true } datafusion-macros = { workspace = true } datafusion-physical-optimizer = { workspace = true } doc-comment = { workspace = true } +bytes = { workspace = true } env_logger = { workspace = true } -glob = { version = "0.3.0" } +glob = { workspace = true } insta = { workspace = true } -paste = "^1.0" rand = { workspace = true, features = ["small_rng"] } rand_distr = "0.5" +recursive = { workspace = true } regex = { workspace = true } rstest = { workspace = true } serde_json = { workspace = true } -sysinfo = "0.37.2" +sysinfo = "0.38.2" test-utils = { path = "../../test-utils" } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot", "fs"] } @@ -186,7 +187,7 @@ tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot", "fs"] ignored = 
["datafusion-doc", "datafusion-macros", "dashmap"] [target.'cfg(not(target_os = "windows"))'.dev-dependencies] -nix = { version = "0.30.1", features = ["fs"] } +nix = { version = "0.31.1", features = ["fs"] } [[bench]] harness = false @@ -224,6 +225,10 @@ name = "struct_query_sql" harness = false name = "window_query_sql" +[[bench]] +harness = false +name = "topk_repartition" + [[bench]] harness = false name = "scalar" @@ -237,6 +242,20 @@ harness = false name = "parquet_query_sql" required-features = ["parquet"] +[[bench]] +harness = false +name = "parquet_struct_query" +required-features = ["parquet"] + +[[bench]] +harness = false +name = "parquet_struct_projection" +required-features = ["parquet"] + +[[bench]] +harness = false +name = "range_and_generate_series" + [[bench]] harness = false name = "sql_planner" @@ -269,3 +288,12 @@ name = "dataframe" [[bench]] harness = false name = "spm" + +[[bench]] +harness = false +name = "preserve_file_partitioning" +required-features = ["parquet"] + +[[bench]] +harness = false +name = "reset_plan_states" diff --git a/datafusion/core/benches/aggregate_query_sql.rs b/datafusion/core/benches/aggregate_query_sql.rs index 87aeed49337eb..d7e24aceba170 100644 --- a/datafusion/core/benches/aggregate_query_sql.rs +++ b/datafusion/core/benches/aggregate_query_sql.rs @@ -15,14 +15,9 @@ // specific language governing permissions and limitations // under the License. -#[macro_use] -extern crate criterion; -extern crate arrow; -extern crate datafusion; - mod data_utils; -use crate::criterion::Criterion; +use criterion::{Criterion, criterion_group, criterion_main}; use data_utils::create_table_provider; use datafusion::error::Result; use datafusion::execution::context::SessionContext; @@ -31,6 +26,7 @@ use std::hint::black_box; use std::sync::Arc; use tokio::runtime::Runtime; +#[expect(clippy::needless_pass_by_value)] fn query(ctx: Arc>, rt: &Runtime, sql: &str) { let df = rt.block_on(ctx.lock().sql(sql)).unwrap(); black_box(rt.block_on(df.collect()).unwrap()); @@ -255,6 +251,83 @@ fn criterion_benchmark(c: &mut Criterion) { ) }) }); + + c.bench_function("array_agg_query_group_by_few_groups", |b| { + b.iter(|| { + query( + ctx.clone(), + &rt, + "SELECT u64_narrow, array_agg(f64) \ + FROM t GROUP BY u64_narrow", + ) + }) + }); + + c.bench_function("array_agg_query_group_by_mid_groups", |b| { + b.iter(|| { + query( + ctx.clone(), + &rt, + "SELECT u64_mid, array_agg(f64) \ + FROM t GROUP BY u64_mid", + ) + }) + }); + + c.bench_function("array_agg_query_group_by_many_groups", |b| { + b.iter(|| { + query( + ctx.clone(), + &rt, + "SELECT u64_wide, array_agg(f64) \ + FROM t GROUP BY u64_wide", + ) + }) + }); + + c.bench_function("array_agg_struct_query_group_by_mid_groups", |b| { + b.iter(|| { + query( + ctx.clone(), + &rt, + "SELECT u64_mid, array_agg(named_struct('market', dict10, 'price', f64)) \ + FROM t GROUP BY u64_mid", + ) + }) + }); + + c.bench_function("string_agg_query_group_by_few_groups", |b| { + b.iter(|| { + query( + ctx.clone(), + &rt, + "SELECT u64_narrow, string_agg(utf8, ',') \ + FROM t GROUP BY u64_narrow", + ) + }) + }); + + c.bench_function("string_agg_query_group_by_mid_groups", |b| { + b.iter(|| { + query( + ctx.clone(), + &rt, + "SELECT u64_mid, string_agg(utf8, ',') \ + FROM t GROUP BY u64_mid", + ) + }) + }); + + c.bench_function("string_agg_query_group_by_many_groups", |b| { + b.iter(|| { + query( + ctx.clone(), + &rt, + "SELECT u64_wide, string_agg(utf8, ',') \ + FROM t GROUP BY u64_wide", + ) + }) + }); } criterion_group!(benches, 
criterion_benchmark);
diff --git a/datafusion/core/benches/csv_load.rs b/datafusion/core/benches/csv_load.rs
index de0f0d8250572..13843dadddd0c 100644
--- a/datafusion/core/benches/csv_load.rs
+++ b/datafusion/core/benches/csv_load.rs
@@ -15,14 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#[macro_use]
-extern crate criterion;
-extern crate arrow;
-extern crate datafusion;
-
 mod data_utils;
 
-use crate::criterion::Criterion;
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion::error::Result;
 use datafusion::execution::context::SessionContext;
 use datafusion::prelude::CsvReadOptions;
@@ -34,6 +29,7 @@ use std::time::Duration;
 use test_utils::AccessLogGenerator;
 use tokio::runtime::Runtime;
 
+#[expect(clippy::needless_pass_by_value)]
 fn load_csv(
     ctx: Arc<Mutex<SessionContext>>,
     rt: &Runtime,
diff --git a/datafusion/core/benches/data_utils/mod.rs b/datafusion/core/benches/data_utils/mod.rs
index fffe2e2d17522..728c6490c72bd 100644
--- a/datafusion/core/benches/data_utils/mod.rs
+++ b/datafusion/core/benches/data_utils/mod.rs
@@ -18,10 +18,11 @@
 //! This module provides the in-memory table for more realistic benchmarking.
 
 use arrow::array::{
-    builder::{Int64Builder, StringBuilder},
     ArrayRef, Float32Array, Float64Array, RecordBatch, StringArray,
     StringViewBuilder, UInt64Array,
+    builder::{Int64Builder, StringBuilder, StringDictionaryBuilder},
 };
+use arrow::datatypes::Int32Type;
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use datafusion::datasource::MemTable;
 use datafusion::error::Result;
@@ -36,6 +37,7 @@ use std::sync::Arc;
 
 /// create an in-memory table given the partition len, array len, and batch size,
 /// and the result table will be of array_len in total, and then partitioned, and batched.
+#[expect(clippy::allow_attributes)] // some issue where expect(dead_code) doesn't fire properly
 #[allow(dead_code)]
 pub fn create_table_provider(
     partitions_len: usize,
@@ -44,7 +46,7 @@ pub fn create_table_provider(
 ) -> Result<Arc<MemTable>> {
     let schema = Arc::new(create_schema());
     let partitions =
-        create_record_batches(schema.clone(), array_len, partitions_len, batch_size);
+        create_record_batches(&schema, array_len, partitions_len, batch_size);
 
     // declare a table in memory. In spark API, this corresponds to createDataFrame(...).
     MemTable::try_new(schema, partitions).map(Arc::new)
 }
@@ -55,21 +57,24 @@ pub fn create_schema() -> Schema {
         Field::new("utf8", DataType::Utf8, false),
         Field::new("f32", DataType::Float32, false),
         Field::new("f64", DataType::Float64, true),
-        // This field will contain integers randomly selected from a large
-        // range of values, i.e. [0, u64::MAX], such that there are none (or
-        // very few) repeated values.
-        Field::new("u64_wide", DataType::UInt64, true),
-        // This field will contain integers randomly selected from a narrow
-        // range of values such that there are a few distinct values, but they
-        // are repeated often.
+        // Integers randomly selected from a wide range of values, i.e. [0,
+        // u64::MAX], such that there are ~no repeated values.
+        Field::new("u64_wide", DataType::UInt64, false),
+        // Integers randomly selected from a mid-range of values [0, 1000),
+        // providing ~1000 distinct groups.
+        Field::new("u64_mid", DataType::UInt64, false),
+        // Integers randomly selected from a narrow range of values such that
+        // there are a few distinct values, but they are repeated often.
Field::new("u64_narrow", DataType::UInt64, false), + Field::new( + "dict10", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + true, + ), ]) } -fn create_data(size: usize, null_density: f64) -> Vec> { - // use random numbers to avoid spurious compiler optimizations wrt to branching - let mut rng = StdRng::seed_from_u64(42); - +fn create_data(rng: &mut StdRng, size: usize, null_density: f64) -> Vec> { (0..size) .map(|_| { if rng.random::() > null_density { @@ -81,57 +86,54 @@ fn create_data(size: usize, null_density: f64) -> Vec> { .collect() } -fn create_integer_data( - rng: &mut StdRng, - size: usize, - value_density: f64, -) -> Vec> { - (0..size) - .map(|_| { - if rng.random::() > value_density { - None - } else { - Some(rng.random::()) - } - }) - .collect() -} - fn create_record_batch( schema: SchemaRef, rng: &mut StdRng, batch_size: usize, - i: usize, + batch_index: usize, ) -> RecordBatch { - // the 4 here is the number of different keys. - // a higher number increase sparseness - let vs = [0, 1, 2, 3]; - let keys: Vec = (0..batch_size) - .map( - // use random numbers to avoid spurious compiler optimizations wrt to branching - |_| format!("hi{:?}", vs.choose(rng)), - ) - .collect(); - let keys: Vec<&str> = keys.iter().map(|e| &**e).collect(); + // Randomly choose from 4 distinct key values; a higher number increases sparseness. + let key_suffixes = [0, 1, 2, 3]; + let keys = StringArray::from_iter_values( + (0..batch_size).map(|_| format!("hi{}", key_suffixes.choose(rng).unwrap())), + ); - let values = create_data(batch_size, 0.5); + let values = create_data(rng, batch_size, 0.5); // Integer values between [0, u64::MAX]. - let integer_values_wide = create_integer_data(rng, batch_size, 9.0); + let integer_values_wide = (0..batch_size) + .map(|_| rng.random::()) + .collect::>(); - // Integer values between [0, 9]. + // Integer values between [0, 1000). + let integer_values_mid = (0..batch_size) + .map(|_| rng.random_range(0..1000)) + .collect::>(); + + // Integer values between [0, 10). 
let integer_values_narrow = (0..batch_size) - .map(|_| rng.random_range(0_u64..10)) + .map(|_| rng.random_range(0..10)) .collect::>(); + let mut dict_builder = StringDictionaryBuilder::::new(); + for _ in 0..batch_size { + if rng.random::() > 0.9 { + dict_builder.append_null(); + } else { + dict_builder.append_value(format!("market_{}", rng.random_range(0..10))); + } + } + RecordBatch::try_new( schema, vec![ - Arc::new(StringArray::from(keys)), - Arc::new(Float32Array::from(vec![i as f32; batch_size])), + Arc::new(keys), + Arc::new(Float32Array::from(vec![batch_index as f32; batch_size])), Arc::new(Float64Array::from(values)), Arc::new(UInt64Array::from(integer_values_wide)), + Arc::new(UInt64Array::from(integer_values_mid)), Arc::new(UInt64Array::from(integer_values_narrow)), + Arc::new(dict_builder.finish()), ], ) .unwrap() @@ -140,19 +142,28 @@ fn create_record_batch( /// Create record batches of `partitions_len` partitions and `batch_size` for each batch, /// with a total number of `array_len` records pub fn create_record_batches( - schema: SchemaRef, + schema: &SchemaRef, array_len: usize, partitions_len: usize, batch_size: usize, ) -> Vec> { let mut rng = StdRng::seed_from_u64(42); - (0..partitions_len) - .map(|_| { - (0..array_len / batch_size / partitions_len) - .map(|i| create_record_batch(schema.clone(), &mut rng, batch_size, i)) - .collect::>() - }) - .collect::>() + let mut partitions = Vec::with_capacity(partitions_len); + let batches_per_partition = array_len / batch_size / partitions_len; + + for _ in 0..partitions_len { + let mut batches = Vec::with_capacity(batches_per_partition); + for batch_index in 0..batches_per_partition { + batches.push(create_record_batch( + schema.clone(), + &mut rng, + batch_size, + batch_index, + )); + } + partitions.push(batches); + } + partitions } /// An enum that wraps either a regular StringBuilder or a GenericByteViewBuilder @@ -182,6 +193,7 @@ impl TraceIdBuilder { /// Create time series data with `partition_cnt` partitions and `sample_cnt` rows per partition /// in ascending order, if `asc` is true, otherwise randomly sampled using a Pareto distribution +#[expect(clippy::allow_attributes)] // some issue where expect(dead_code) doesn't fire properly #[allow(dead_code)] pub(crate) fn make_data( partition_cnt: i32, diff --git a/datafusion/core/benches/dataframe.rs b/datafusion/core/benches/dataframe.rs index 00fa85918347a..5aeade315cc7b 100644 --- a/datafusion/core/benches/dataframe.rs +++ b/datafusion/core/benches/dataframe.rs @@ -15,13 +15,8 @@ // specific language governing permissions and limitations // under the License. -extern crate arrow; -#[macro_use] -extern crate criterion; -extern crate datafusion; - use arrow_schema::{DataType, Field, Schema}; -use criterion::Criterion; +use criterion::{Criterion, criterion_group, criterion_main}; use datafusion::datasource::MemTable; use datafusion::prelude::SessionContext; use datafusion_expr::col; @@ -45,6 +40,7 @@ fn create_context(field_count: u32) -> datafusion_common::Result, rt: &Runtime) { black_box(rt.block_on(async { let mut data_frame = ctx.table("t").await.unwrap(); diff --git a/datafusion/core/benches/distinct_query_sql.rs b/datafusion/core/benches/distinct_query_sql.rs index d05e8b13b2af3..d389b1b3d6a22 100644 --- a/datafusion/core/benches/distinct_query_sql.rs +++ b/datafusion/core/benches/distinct_query_sql.rs @@ -15,25 +15,22 @@ // specific language governing permissions and limitations // under the License. 
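The `dict10` column added above holds at most ten distinct strings with roughly 10% nulls, which is what the new `array_agg_struct_query_group_by_mid_groups` bench exercises. A self-contained sketch of the same generation pattern, assuming the arrow and rand 0.9 APIs already used in this patch:

use arrow::array::StringDictionaryBuilder;
use arrow::datatypes::Int32Type;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};

fn main() {
    // Fixed seed: every benchmark run sees identical data.
    let mut rng = StdRng::seed_from_u64(42);
    // Int32 keys indexing into a small string dictionary.
    let mut builder = StringDictionaryBuilder::<Int32Type>::new();
    for _ in 0..1024 {
        if rng.random::<f64>() > 0.9 {
            // ~10% of entries are null.
            builder.append_null();
        } else {
            // At most 10 distinct strings, so the dictionary stays tiny
            // while the keys array grows with the batch size.
            builder.append_value(format!("market_{}", rng.random_range(0..10)));
        }
    }
    let array = builder.finish();
    println!("{} keys, {} dictionary values", array.len(), array.values().len());
}

Dictionary encoding keeps the values array at ten entries regardless of batch size, so the group-by benches measure key hashing rather than string allocation.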
-#[macro_use] -extern crate criterion; -extern crate arrow; -extern crate datafusion; - mod data_utils; -use crate::criterion::Criterion; + +use criterion::{Criterion, criterion_group, criterion_main}; use data_utils::{create_table_provider, make_data}; use datafusion::execution::context::SessionContext; -use datafusion::physical_plan::{collect, ExecutionPlan}; +use datafusion::physical_plan::{ExecutionPlan, collect}; use datafusion::{datasource::MemTable, error::Result}; -use datafusion_execution::config::SessionConfig; use datafusion_execution::TaskContext; +use datafusion_execution::config::SessionConfig; use parking_lot::Mutex; use std::hint::black_box; use std::{sync::Arc, time::Duration}; use tokio::runtime::Runtime; +#[expect(clippy::needless_pass_by_value)] fn query(ctx: Arc>, rt: &Runtime, sql: &str) { let df = rt.block_on(ctx.lock().sql(sql)).unwrap(); black_box(rt.block_on(df.collect()).unwrap()); @@ -124,6 +121,7 @@ async fn distinct_with_limit( Ok(()) } +#[expect(clippy::needless_pass_by_value)] fn run(rt: &Runtime, plan: Arc, ctx: Arc) { black_box(rt.block_on(distinct_with_limit(plan.clone(), ctx.clone()))).unwrap(); } diff --git a/datafusion/core/benches/filter_query_sql.rs b/datafusion/core/benches/filter_query_sql.rs index 16905e0f96605..3b80518d32dcd 100644 --- a/datafusion/core/benches/filter_query_sql.rs +++ b/datafusion/core/benches/filter_query_sql.rs @@ -20,7 +20,7 @@ use arrow::{ datatypes::{DataType, Field, Schema}, record_batch::RecordBatch, }; -use criterion::{criterion_group, criterion_main, Criterion}; +use criterion::{Criterion, criterion_group, criterion_main}; use datafusion::prelude::SessionContext; use datafusion::{datasource::MemTable, error::Result}; use futures::executor::block_on; diff --git a/datafusion/core/benches/map_query_sql.rs b/datafusion/core/benches/map_query_sql.rs index 09234546b2dfe..67904197bc257 100644 --- a/datafusion/core/benches/map_query_sql.rs +++ b/datafusion/core/benches/map_query_sql.rs @@ -15,14 +15,15 @@ // specific language governing permissions and limitations // under the License. +use std::collections::HashSet; use std::hint::black_box; use std::sync::Arc; use arrow::array::{ArrayRef, Int32Array, RecordBatch}; -use criterion::{criterion_group, criterion_main, Criterion}; +use criterion::{Criterion, criterion_group, criterion_main}; use parking_lot::Mutex; -use rand::prelude::ThreadRng; use rand::Rng; +use rand::prelude::ThreadRng; use tokio::runtime::Runtime; use datafusion::prelude::SessionContext; @@ -33,11 +34,12 @@ use datafusion_functions_nested::map::map; mod data_utils; fn build_keys(rng: &mut ThreadRng) -> Vec { - let mut keys = vec![]; - for _ in 0..1000 { - keys.push(rng.random_range(0..9999).to_string()); + let mut keys = HashSet::with_capacity(1000); + while keys.len() < 1000 { + let key = rng.random_range(0..9999).to_string(); + keys.insert(key); } - keys + keys.into_iter().collect() } fn build_values(rng: &mut ThreadRng) -> Vec { diff --git a/datafusion/core/benches/math_query_sql.rs b/datafusion/core/benches/math_query_sql.rs index 76824850c114c..f5df56e95a2d8 100644 --- a/datafusion/core/benches/math_query_sql.rs +++ b/datafusion/core/benches/math_query_sql.rs @@ -15,18 +15,13 @@ // specific language governing permissions and limitations // under the License. 
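The `build_keys` rewrite above draws into a `HashSet` until exactly 1000 distinct keys have been produced, presumably because duplicate map keys would either be rejected when the map array is built or would skew the benchmark. The same idea as a free-standing helper (the function name is illustrative):

use rand::Rng;
use std::collections::HashSet;

/// Keep drawing until `n` distinct keys have been seen; a plain `Vec`
/// of 1000 draws from 0..9999 would contain duplicates fairly often.
fn distinct_keys<R: Rng>(rng: &mut R, n: usize) -> Vec<String> {
    let mut keys = HashSet::with_capacity(n);
    while keys.len() < n {
        keys.insert(rng.random_range(0..9999).to_string());
    }
    keys.into_iter().collect()
}

fn main() {
    let mut rng = rand::rng();
    let keys = distinct_keys(&mut rng, 1000);
    assert_eq!(keys.len(), 1000);
}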
-#[macro_use] -extern crate criterion; -use criterion::Criterion; +use criterion::{Criterion, criterion_group, criterion_main}; use parking_lot::Mutex; use std::sync::Arc; use tokio::runtime::Runtime; -extern crate arrow; -extern crate datafusion; - use arrow::{ array::{Float32Array, Float64Array}, datatypes::{DataType, Field, Schema}, @@ -36,6 +31,7 @@ use datafusion::datasource::MemTable; use datafusion::error::Result; use datafusion::execution::context::SessionContext; +#[expect(clippy::needless_pass_by_value)] fn query(ctx: Arc>, rt: &Runtime, sql: &str) { // execute the query let df = rt.block_on(ctx.lock().sql(sql)).unwrap(); diff --git a/datafusion/core/benches/parquet_query_sql.rs b/datafusion/core/benches/parquet_query_sql.rs index e2b3810480130..f099137973592 100644 --- a/datafusion/core/benches/parquet_query_sql.rs +++ b/datafusion/core/benches/parquet_query_sql.rs @@ -23,14 +23,14 @@ use arrow::datatypes::{ SchemaRef, }; use arrow::record_batch::RecordBatch; -use criterion::{criterion_group, criterion_main, Criterion}; +use criterion::{Criterion, criterion_group, criterion_main}; use datafusion::prelude::{SessionConfig, SessionContext}; use datafusion_common::instant::Instant; use futures::stream::StreamExt; use parquet::arrow::ArrowWriter; use parquet::file::properties::{WriterProperties, WriterVersion}; -use rand::distr::uniform::SampleUniform; use rand::distr::Alphanumeric; +use rand::distr::uniform::SampleUniform; use rand::prelude::*; use rand::rng; use std::fs::File; @@ -45,7 +45,7 @@ const NUM_BATCHES: usize = 2048; /// The number of rows in each record batch to write const WRITE_RECORD_BATCH_SIZE: usize = 1024; /// The number of rows in a row group -const ROW_GROUP_SIZE: usize = 1024 * 1024; +const ROW_GROUP_ROW_COUNT: usize = 1024 * 1024; /// The number of row groups expected const EXPECTED_ROW_GROUPS: usize = 2; @@ -154,7 +154,7 @@ fn generate_file() -> NamedTempFile { let properties = WriterProperties::builder() .set_writer_version(WriterVersion::PARQUET_2_0) - .set_max_row_group_size(ROW_GROUP_SIZE) + .set_max_row_group_row_count(Some(ROW_GROUP_ROW_COUNT)) .build(); let mut writer = diff --git a/datafusion/core/benches/parquet_struct_projection.rs b/datafusion/core/benches/parquet_struct_projection.rs new file mode 100644 index 0000000000000..7d5b220d397f8 --- /dev/null +++ b/datafusion/core/benches/parquet_struct_projection.rs @@ -0,0 +1,496 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmarks for struct leaf-level projection pruning in Parquet. +//! +//! Measures the benefit of reading only the needed leaf columns from a +//! struct column. Three dataset shapes are tested: +//! +//! 1. **Narrow struct** (2 leaves): one 128 KiB UTF-8 field + one INT field +//! 2. 
**Wide struct** (5 leaves): four 128 KiB UTF-8 fields + one INT field +//! 3. **Nested struct** (3 leaves): `STRUCT, extra_string>` +//! +//! In all cases, projecting just the small integer should skip decoding +//! all of the large string leaves, including through nested struct levels. + +use arrow::array::{ArrayRef, Int32Array, StringBuilder, StructArray}; +use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; +use arrow::record_batch::RecordBatch; +use criterion::{Criterion, criterion_group, criterion_main}; +use datafusion::prelude::SessionContext; +use datafusion_common::instant::Instant; +use parquet::arrow::ArrowWriter; +use parquet::file::properties::{WriterProperties, WriterVersion}; +use std::hint::black_box; +use std::path::Path; +use std::sync::Arc; +use std::time::Duration; +use tempfile::NamedTempFile; +use tokio::runtime::Runtime; + +const NUM_BATCHES: usize = 2; +const WRITE_RECORD_BATCH_SIZE: usize = 256; +const ROW_GROUP_ROW_COUNT: usize = 256; +const EXPECTED_ROW_GROUPS: usize = 2; +const LARGE_STRING_LEN: usize = 16 * 1024; + +fn narrow_schema() -> SchemaRef { + let struct_fields = Fields::from(vec![ + Field::new("large_string", DataType::Utf8, false), + Field::new("small_int", DataType::Int32, false), + ]); + Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("s", DataType::Struct(struct_fields), false), + ])) +} + +fn narrow_batch(batch_id: usize) -> RecordBatch { + let schema = narrow_schema(); + let len = WRITE_RECORD_BATCH_SIZE; + + let base_id = (batch_id * len) as i32; + let id_values: Vec = (0..len).map(|i| base_id + i as i32).collect(); + let id_array = Arc::new(Int32Array::from(id_values.clone())); + + let small_int_array = Arc::new(Int32Array::from(id_values)); + + let large_string: String = "x".repeat(LARGE_STRING_LEN); + let mut string_builder = StringBuilder::new(); + for _ in 0..len { + string_builder.append_value(&large_string); + } + let large_string_array = Arc::new(string_builder.finish()); + + let struct_array = StructArray::from(vec![ + ( + Arc::new(Field::new("large_string", DataType::Utf8, false)), + large_string_array as ArrayRef, + ), + ( + Arc::new(Field::new("small_int", DataType::Int32, false)), + small_int_array as ArrayRef, + ), + ]); + + RecordBatch::try_new(schema, vec![id_array, Arc::new(struct_array)]).unwrap() +} + +fn wide_schema() -> SchemaRef { + let struct_fields = Fields::from(vec![ + Field::new("str_a", DataType::Utf8, false), + Field::new("str_b", DataType::Utf8, false), + Field::new("str_c", DataType::Utf8, false), + Field::new("str_d", DataType::Utf8, false), + Field::new("small_int", DataType::Int32, false), + ]); + Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("s", DataType::Struct(struct_fields), false), + ])) +} + +fn wide_batch(batch_id: usize) -> RecordBatch { + let schema = wide_schema(); + let len = WRITE_RECORD_BATCH_SIZE; + + let base_id = (batch_id * len) as i32; + let id_values: Vec = (0..len).map(|i| base_id + i as i32).collect(); + let id_array = Arc::new(Int32Array::from(id_values.clone())); + let small_int_array = Arc::new(Int32Array::from(id_values)); + + let large_string: String = "x".repeat(LARGE_STRING_LEN); + let mut string_fields: Vec<(Arc, ArrayRef)> = Vec::new(); + for name in &["str_a", "str_b", "str_c", "str_d"] { + let mut sb = StringBuilder::new(); + for _ in 0..len { + sb.append_value(&large_string); + } + string_fields.push(( + Arc::new(Field::new(*name, DataType::Utf8, false)), + Arc::new(sb.finish()) as 
ArrayRef, + )); + } + string_fields.push(( + Arc::new(Field::new("small_int", DataType::Int32, false)), + small_int_array as ArrayRef, + )); + + let struct_array = StructArray::from(string_fields); + RecordBatch::try_new(schema, vec![id_array, Arc::new(struct_array)]).unwrap() +} + +fn generate_file( + schema: SchemaRef, + batch_fn: fn(usize) -> RecordBatch, + prefix: &str, +) -> NamedTempFile { + let now = Instant::now(); + let mut named_file = tempfile::Builder::new() + .prefix(prefix) + .suffix(".parquet") + .tempfile() + .unwrap(); + + println!("Generating parquet file - {}", named_file.path().display()); + + let properties = WriterProperties::builder() + .set_writer_version(WriterVersion::PARQUET_2_0) + .set_max_row_group_row_count(Some(ROW_GROUP_ROW_COUNT)) + .build(); + + let mut writer = + ArrowWriter::try_new(&mut named_file, schema, Some(properties)).unwrap(); + + for batch_id in 0..NUM_BATCHES { + let batch = batch_fn(batch_id); + writer.write(&batch).unwrap(); + } + + let metadata = writer.close().unwrap(); + let file_metadata = metadata.file_metadata(); + let expected_rows = WRITE_RECORD_BATCH_SIZE * NUM_BATCHES; + assert_eq!( + file_metadata.num_rows() as usize, + expected_rows, + "Expected {expected_rows} rows but got {}", + file_metadata.num_rows() + ); + assert_eq!( + metadata.row_groups().len(), + EXPECTED_ROW_GROUPS, + "Expected {EXPECTED_ROW_GROUPS} row groups but got {}", + metadata.row_groups().len() + ); + + println!( + "Generated parquet file with {} rows and {} row groups in {:.2}s", + file_metadata.num_rows(), + metadata.row_groups().len(), + now.elapsed().as_secs_f32() + ); + + named_file +} + +fn create_context(rt: &Runtime, file_path: &str, table: &str) -> SessionContext { + let ctx = SessionContext::new(); + rt.block_on(ctx.register_parquet(table, file_path, Default::default())) + .unwrap(); + ctx +} + +fn query(ctx: &SessionContext, rt: &Runtime, sql: &str) { + let ctx = ctx.clone(); + let sql = sql.to_string(); + let df = rt.block_on(ctx.sql(&sql)).unwrap(); + black_box(rt.block_on(df.collect()).unwrap()); +} + +fn narrow_benchmarks(c: &mut Criterion) { + let temp_file = generate_file(narrow_schema(), narrow_batch, "narrow_struct"); + let file_path = temp_file.path().display().to_string(); + assert!(Path::new(&file_path).exists(), "path not found"); + + let rt = Runtime::new().unwrap(); + let ctx = create_context(&rt, &file_path, "t"); + + let mut group = c.benchmark_group("narrow_struct"); + group.sample_size(10); + group.warm_up_time(Duration::from_secs(1)); + group.measurement_time(Duration::from_secs(2)); + + // baseline: full struct, must decode both leaves + group.bench_function("select_struct", |b| { + b.iter(|| query(&ctx, &rt, "SELECT s FROM t")) + }); + + // pruned: skip large_string, read only small_int + group.bench_function("select_small_field", |b| { + b.iter(|| query(&ctx, &rt, "SELECT s['small_int'] FROM t")) + }); + + // pruned: skip small_int, read only large_string + group.bench_function("select_large_field", |b| { + b.iter(|| query(&ctx, &rt, "SELECT s['large_string'] FROM t")) + }); + + // no pruning: all columns + group.bench_function("select_all", |b| { + b.iter(|| query(&ctx, &rt, "SELECT * FROM t")) + }); + + // top-level column + pruned struct sub-field + group.bench_function("select_id_and_small_field", |b| { + b.iter(|| query(&ctx, &rt, "SELECT id, s['small_int'] FROM t")) + }); + + // aggregation on pruned sub-field, realistic analytical pattern + group.bench_function("sum_small_field", |b| { + b.iter(|| query(&ctx, &rt, "SELECT 
SUM(s['small_int']) FROM t")) + }); + + group.finish(); + drop(temp_file); +} + +fn wide_benchmarks(c: &mut Criterion) { + let temp_file = generate_file(wide_schema(), wide_batch, "wide_struct"); + let file_path = temp_file.path().display().to_string(); + assert!(Path::new(&file_path).exists(), "path not found"); + + let rt = Runtime::new().unwrap(); + let ctx = create_context(&rt, &file_path, "t"); + + let mut group = c.benchmark_group("wide_struct"); + group.sample_size(10); + group.warm_up_time(Duration::from_secs(1)); + group.measurement_time(Duration::from_secs(2)); + + // baseline: full struct, must decode all 5 leaves + group.bench_function("select_struct", |b| { + b.iter(|| query(&ctx, &rt, "SELECT s FROM t")) + }); + + // pruned: skip all 4 large string leaves + group.bench_function("select_small_field", |b| { + b.iter(|| query(&ctx, &rt, "SELECT s['small_int'] FROM t")) + }); + + // pruned: read 1 of 4 string leaves + skip the rest + group.bench_function("select_one_string_field", |b| { + b.iter(|| query(&ctx, &rt, "SELECT s['str_a'] FROM t")) + }); + + // pruned: read 2 of 4 string leaves + group.bench_function("select_two_string_fields", |b| { + b.iter(|| query(&ctx, &rt, "SELECT s['str_a'], s['str_b'] FROM t")) + }); + + // no pruning: all columns + group.bench_function("select_all", |b| { + b.iter(|| query(&ctx, &rt, "SELECT * FROM t")) + }); + + // aggregation on pruned sub-field, skips all 4 large leaves + group.bench_function("sum_small_field", |b| { + b.iter(|| query(&ctx, &rt, "SELECT SUM(s['small_int']) FROM t")) + }); + + group.finish(); + drop(temp_file); +} + +fn nested_schema() -> SchemaRef { + let inner_fields = Fields::from(vec![ + Field::new("large_string", DataType::Utf8, false), + Field::new("small_int", DataType::Int32, false), + ]); + let outer_fields = Fields::from(vec![ + Field::new("inner", DataType::Struct(inner_fields), false), + Field::new("extra_string", DataType::Utf8, false), + ]); + Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("s", DataType::Struct(outer_fields), false), + ])) +} + +fn nested_batch(batch_id: usize) -> RecordBatch { + let schema = nested_schema(); + let len = WRITE_RECORD_BATCH_SIZE; + + let base_id = (batch_id * len) as i32; + let id_values: Vec = (0..len).map(|i| base_id + i as i32).collect(); + let id_array = Arc::new(Int32Array::from(id_values.clone())); + let small_int_array = Arc::new(Int32Array::from(id_values)); + + let large_string: String = "x".repeat(LARGE_STRING_LEN); + + let mut sb1 = StringBuilder::new(); + let mut sb2 = StringBuilder::new(); + for _ in 0..len { + sb1.append_value(&large_string); + sb2.append_value(&large_string); + } + + let inner_struct = StructArray::from(vec![ + ( + Arc::new(Field::new("large_string", DataType::Utf8, false)), + Arc::new(sb1.finish()) as ArrayRef, + ), + ( + Arc::new(Field::new("small_int", DataType::Int32, false)), + small_int_array as ArrayRef, + ), + ]); + + let inner_fields = Fields::from(vec![ + Field::new("large_string", DataType::Utf8, false), + Field::new("small_int", DataType::Int32, false), + ]); + let outer_struct = StructArray::from(vec![ + ( + Arc::new(Field::new("inner", DataType::Struct(inner_fields), false)), + Arc::new(inner_struct) as ArrayRef, + ), + ( + Arc::new(Field::new("extra_string", DataType::Utf8, false)), + Arc::new(sb2.finish()) as ArrayRef, + ), + ]); + + RecordBatch::try_new(schema, vec![id_array, Arc::new(outer_struct)]).unwrap() +} + +fn nested_benchmarks(c: &mut Criterion) { + let temp_file = 
generate_file(nested_schema(), nested_batch, "nested_struct"); + let file_path = temp_file.path().display().to_string(); + assert!(Path::new(&file_path).exists(), "path not found"); + + let rt = Runtime::new().unwrap(); + let ctx = create_context(&rt, &file_path, "t"); + + let mut group = c.benchmark_group("nested_struct"); + group.sample_size(10); + group.warm_up_time(Duration::from_secs(1)); + group.measurement_time(Duration::from_secs(2)); + + // baseline: full outer struct, decode all 3 leaves + group.bench_function("select_struct", |b| { + b.iter(|| query(&ctx, &rt, "SELECT s FROM t")) + }); + + // pruned outer: read only inner struct, skip extra_string + group.bench_function("select_inner_struct", |b| { + b.iter(|| query(&ctx, &rt, "SELECT s['inner'] FROM t")) + }); + + // pruned both levels: reach through outer + inner, skip both large strings + group.bench_function("select_inner_small_field", |b| { + b.iter(|| query(&ctx, &rt, "SELECT s['inner']['small_int'] FROM t")) + }); + + // pruned outer only: skip inner struct entirely, read extra_string + group.bench_function("select_extra_string", |b| { + b.iter(|| query(&ctx, &rt, "SELECT s['extra_string'] FROM t")) + }); + + // no pruning: all columns + group.bench_function("select_all", |b| { + b.iter(|| query(&ctx, &rt, "SELECT * FROM t")) + }); + + // aggregation reaching through two levels of nesting + group.bench_function("sum_inner_small_field", |b| { + b.iter(|| query(&ctx, &rt, "SELECT SUM(s['inner']['small_int']) FROM t")) + }); + + group.finish(); + drop(temp_file); +} + +fn flat_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("large_string", DataType::Utf8, false), + Field::new("small_int", DataType::Int32, false), + ])) +} + +fn flat_batch(batch_id: usize) -> RecordBatch { + let schema = flat_schema(); + let len = WRITE_RECORD_BATCH_SIZE; + + let base_id = (batch_id * len) as i32; + let id_values: Vec = (0..len).map(|i| base_id + i as i32).collect(); + let id_array = Arc::new(Int32Array::from(id_values.clone())); + let small_int_array = Arc::new(Int32Array::from(id_values)); + + let large_string: String = "x".repeat(LARGE_STRING_LEN); + let mut string_builder = StringBuilder::new(); + for _ in 0..len { + string_builder.append_value(&large_string); + } + let large_string_array = Arc::new(string_builder.finish()); + + RecordBatch::try_new( + schema, + vec![id_array, large_string_array as ArrayRef, small_int_array], + ) + .unwrap() +} + +/// Compare selecting a small field from a flat (top-level) schema vs from +/// inside a struct. Both files contain the same logical data — the only +/// difference is whether `small_int` lives at the top level or nested inside +/// a struct column. 
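The comparison below leans on DataFusion's bracket syntax for pulling one leaf out of a struct column. A minimal in-memory sketch of that access path, using a `MemTable` instead of parquet, so it shows only the SQL surface rather than the leaf pruning being measured (assumes default crate features plus tokio's `rt` and `macros` features):

use arrow::array::{ArrayRef, Int32Array, StringArray, StructArray};
use arrow::datatypes::{DataType, Field};
use arrow::record_batch::RecordBatch;
use datafusion::datasource::MemTable;
use datafusion::prelude::SessionContext;
use std::sync::Arc;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    // Same "narrow" shape as above: one string leaf and one int leaf.
    let large_string: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c"]));
    let small_int: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
    let s = StructArray::from(vec![
        (
            Arc::new(Field::new("large_string", DataType::Utf8, false)),
            large_string,
        ),
        (
            Arc::new(Field::new("small_int", DataType::Int32, false)),
            small_int,
        ),
    ]);
    let batch = RecordBatch::try_from_iter(vec![("s", Arc::new(s) as ArrayRef)])?;

    let ctx = SessionContext::new();
    let table = MemTable::try_new(batch.schema(), vec![vec![batch]])?;
    ctx.register_table("t", Arc::new(table))?;

    // Bracket syntax selects a single struct leaf; against parquet sources
    // this projection is what lets the reader skip the sibling leaves.
    ctx.sql("SELECT s['small_int'] FROM t").await?.show().await?;
    Ok(())
}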
+fn flat_vs_struct_benchmarks(c: &mut Criterion) { + let flat_file = generate_file(flat_schema(), flat_batch, "flat"); + let flat_path = flat_file.path().display().to_string(); + assert!(Path::new(&flat_path).exists(), "path not found"); + + let struct_file = generate_file(narrow_schema(), narrow_batch, "narrow_struct_cmp"); + let struct_path = struct_file.path().display().to_string(); + assert!(Path::new(&struct_path).exists(), "path not found"); + + let rt = Runtime::new().unwrap(); + let flat_ctx = create_context(&rt, &flat_path, "t"); + let struct_ctx = create_context(&rt, &struct_path, "t"); + + let mut group = c.benchmark_group("flat_vs_struct"); + group.sample_size(10); + group.warm_up_time(Duration::from_secs(1)); + group.measurement_time(Duration::from_secs(2)); + + // small int: top-level vs struct field + group.bench_function("flat_select_small_int", |b| { + b.iter(|| query(&flat_ctx, &rt, "SELECT small_int FROM t")) + }); + group.bench_function("struct_select_small_int", |b| { + b.iter(|| query(&struct_ctx, &rt, "SELECT s['small_int'] FROM t")) + }); + + // large string: top-level vs struct field + group.bench_function("flat_select_large_string", |b| { + b.iter(|| query(&flat_ctx, &rt, "SELECT large_string FROM t")) + }); + group.bench_function("struct_select_large_string", |b| { + b.iter(|| query(&struct_ctx, &rt, "SELECT s['large_string'] FROM t")) + }); + + // aggregation: SUM of small int + group.bench_function("flat_sum_small_int", |b| { + b.iter(|| query(&flat_ctx, &rt, "SELECT SUM(small_int) FROM t")) + }); + group.bench_function("struct_sum_small_int", |b| { + b.iter(|| query(&struct_ctx, &rt, "SELECT SUM(s['small_int']) FROM t")) + }); + + group.finish(); + drop(flat_file); + drop(struct_file); +} + +criterion_group!( + benches, + narrow_benchmarks, + wide_benchmarks, + nested_benchmarks, + flat_vs_struct_benchmarks, +); +criterion_main!(benches); diff --git a/datafusion/core/benches/parquet_struct_query.rs b/datafusion/core/benches/parquet_struct_query.rs new file mode 100644 index 0000000000000..e7e91f0dd0e1e --- /dev/null +++ b/datafusion/core/benches/parquet_struct_query.rs @@ -0,0 +1,312 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Benchmarks of SQL queries on struct columns in parquet data
+
+use arrow::array::{ArrayRef, Int32Array, StringArray, StructArray};
+use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef};
+use arrow::record_batch::RecordBatch;
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion::prelude::SessionContext;
+use datafusion_common::instant::Instant;
+use parquet::arrow::ArrowWriter;
+use parquet::file::properties::{WriterProperties, WriterVersion};
+use rand::distr::Alphanumeric;
+use rand::prelude::*;
+use rand::rng;
+use std::hint::black_box;
+use std::ops::Range;
+use std::path::Path;
+use std::sync::Arc;
+use tempfile::NamedTempFile;
+use tokio::runtime::Runtime;
+
+/// The number of batches to write
+const NUM_BATCHES: usize = 128;
+/// The number of rows in each record batch to write
+const WRITE_RECORD_BATCH_SIZE: usize = 4096;
+/// The number of rows in a row group
+const ROW_GROUP_ROW_COUNT: usize = 65536;
+/// The number of row groups expected
+const EXPECTED_ROW_GROUPS: usize = 8;
+/// The range for random string lengths
+const STRING_LENGTH_RANGE: Range<usize> = 50..200;
+
+fn schema() -> SchemaRef {
+    let struct_fields = Fields::from(vec![
+        Field::new("id", DataType::Int32, false),
+        Field::new("value", DataType::Utf8, false),
+    ]);
+    let struct_type = DataType::Struct(struct_fields);
+
+    Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Int32, false),
+        Field::new("s", struct_type, false),
+    ]))
+}
+
+fn generate_strings(len: usize) -> ArrayRef {
+    let mut rng = rng();
+    Arc::new(StringArray::from_iter((0..len).map(|_| {
+        let string_len = rng.random_range(STRING_LENGTH_RANGE.clone());
+        Some(
+            (0..string_len)
+                .map(|_| char::from(rng.sample(Alphanumeric)))
+                .collect::<String>(),
+        )
+    })))
+}
+
+fn generate_batch(batch_id: usize) -> RecordBatch {
+    let schema = schema();
+    let len = WRITE_RECORD_BATCH_SIZE;
+
+    // Generate sequential IDs based on batch_id for uniqueness
+    let base_id = (batch_id * len) as i32;
+    let id_values: Vec<i32> = (0..len).map(|i| base_id + i as i32).collect();
+    let id_array = Arc::new(Int32Array::from(id_values.clone()));
+
+    // Create struct id array (matching top-level id)
+    let struct_id_array = Arc::new(Int32Array::from(id_values));
+
+    // Generate random strings for struct value field
+    let value_array = generate_strings(len);
+
+    // Construct StructArray
+    let struct_array = StructArray::from(vec![
+        (
+            Arc::new(Field::new("id", DataType::Int32, false)),
+            struct_id_array as ArrayRef,
+        ),
+        (
+            Arc::new(Field::new("value", DataType::Utf8, false)),
+            value_array,
+        ),
+    ]);
+
+    RecordBatch::try_new(schema, vec![id_array, Arc::new(struct_array)]).unwrap()
+}
+
+fn generate_file() -> NamedTempFile {
+    let now = Instant::now();
+    let mut named_file = tempfile::Builder::new()
+        .prefix("parquet_struct_query")
+        .suffix(".parquet")
+        .tempfile()
+        .unwrap();
+
+    println!("Generating parquet file - {}", named_file.path().display());
+    let schema = schema();
+
+    let properties = WriterProperties::builder()
+        .set_writer_version(WriterVersion::PARQUET_2_0)
+        .set_max_row_group_row_count(Some(ROW_GROUP_ROW_COUNT))
+        .build();
+
+    let mut writer =
+        ArrowWriter::try_new(&mut named_file, schema, Some(properties)).unwrap();
+
+    for batch_id in 0..NUM_BATCHES {
+        let batch = generate_batch(batch_id);
+        writer.write(&batch).unwrap();
+    }
+
+    let metadata = writer.close().unwrap();
+    let file_metadata = metadata.file_metadata();
+    let expected_rows = WRITE_RECORD_BATCH_SIZE * NUM_BATCHES;
+    assert_eq!(
+        file_metadata.num_rows() as
usize, + expected_rows, + "Expected {} rows but got {}", + expected_rows, + file_metadata.num_rows() + ); + assert_eq!( + metadata.row_groups().len(), + EXPECTED_ROW_GROUPS, + "Expected {} row groups but got {}", + EXPECTED_ROW_GROUPS, + metadata.row_groups().len() + ); + + println!( + "Generated parquet file with {} rows and {} row groups in {} seconds", + file_metadata.num_rows(), + metadata.row_groups().len(), + now.elapsed().as_secs_f32() + ); + + named_file +} + +fn create_context(file_path: &str) -> SessionContext { + let ctx = SessionContext::new(); + let rt = Runtime::new().unwrap(); + rt.block_on(ctx.register_parquet("t", file_path, Default::default())) + .unwrap(); + ctx +} + +fn query(ctx: &SessionContext, rt: &Runtime, sql: &str) { + let ctx = ctx.clone(); + let sql = sql.to_string(); + let df = rt.block_on(ctx.sql(&sql)).unwrap(); + black_box(rt.block_on(df.collect()).unwrap()); +} + +fn criterion_benchmark(c: &mut Criterion) { + let (file_path, temp_file) = match std::env::var("PARQUET_FILE") { + Ok(file) => (file, None), + Err(_) => { + let temp_file = generate_file(); + (temp_file.path().display().to_string(), Some(temp_file)) + } + }; + + assert!(Path::new(&file_path).exists(), "path not found"); + println!("Using parquet file {file_path}"); + + let ctx = create_context(&file_path); + let rt = Runtime::new().unwrap(); + + // Basic struct access + c.bench_function("struct_access", |b| { + b.iter(|| query(&ctx, &rt, "select id, s['id'] from t")) + }); + + // Filter queries + c.bench_function("filter_struct_field_eq", |b| { + b.iter(|| query(&ctx, &rt, "select id from t where s['id'] = 5")) + }); + + c.bench_function("filter_struct_field_with_select", |b| { + b.iter(|| query(&ctx, &rt, "select id, s['id'] from t where s['id'] = 5")) + }); + + c.bench_function("filter_top_level_with_struct_select", |b| { + b.iter(|| query(&ctx, &rt, "select s['id'] from t where id = 5")) + }); + + c.bench_function("filter_struct_string_length", |b| { + b.iter(|| query(&ctx, &rt, "select id from t where length(s['value']) > 100")) + }); + + c.bench_function("filter_struct_range", |b| { + b.iter(|| { + query( + &ctx, + &rt, + "select id from t where s['id'] > 100 and s['id'] < 200", + ) + }) + }); + + // Join queries (limited with WHERE id < 1000 for performance) + c.bench_function("join_struct_to_struct", |b| { + b.iter(|| query( + &ctx, + &rt, + "select t1.id from t t1 join t t2 on t1.s['id'] = t2.s['id'] where t1.id < 1000" + )) + }); + + c.bench_function("join_struct_to_toplevel", |b| { + b.iter(|| query( + &ctx, + &rt, + "select t1.id from t t1 join t t2 on t1.s['id'] = t2.id where t1.id < 1000" + )) + }); + + c.bench_function("join_toplevel_to_struct", |b| { + b.iter(|| query( + &ctx, + &rt, + "select t1.id from t t1 join t t2 on t1.id = t2.s['id'] where t1.id < 1000" + )) + }); + + c.bench_function("join_struct_to_struct_with_top_level", |b| { + b.iter(|| query( + &ctx, + &rt, + "select t1.id from t t1 join t t2 on t1.s['id'] = t2.s['id'] and t1.id = t2.id where t1.id < 1000" + )) + }); + + c.bench_function("join_struct_and_struct_value", |b| { + b.iter(|| query( + &ctx, + &rt, + "select t1.s['id'], t2.s['value'] from t t1 join t t2 on t1.id = t2.id where t1.id < 1000" + )) + }); + + // Group by queries + c.bench_function("group_by_struct_field", |b| { + b.iter(|| query(&ctx, &rt, "select s['id'] from t group by s['id']")) + }); + + c.bench_function("group_by_struct_select_toplevel", |b| { + b.iter(|| query(&ctx, &rt, "select max(id) from t group by s['id']")) + }); + + 
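Every case above funnels through the same `query` helper: `rt.block_on` bridges criterion's synchronous iteration with DataFusion's async execution, and `black_box` keeps the collected batches from being optimized away. Reduced to a skeleton, with the bench name and SQL as placeholders:

use criterion::{Criterion, criterion_group, criterion_main};
use datafusion::prelude::SessionContext;
use std::hint::black_box;
use tokio::runtime::Runtime;

fn criterion_benchmark(c: &mut Criterion) {
    let rt = Runtime::new().unwrap();
    let ctx = SessionContext::new();
    c.bench_function("example_query", |b| {
        b.iter(|| {
            // Plan and execute on the shared runtime; black_box defeats
            // dead-code elimination of the result batches.
            let df = rt.block_on(ctx.sql("SELECT 1")).unwrap();
            black_box(rt.block_on(df.collect()).unwrap());
        })
    });
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);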
c.bench_function("group_by_toplevel_select_struct", |b| { + b.iter(|| query(&ctx, &rt, "select max(s['id']) from t group by id")) + }); + + c.bench_function("group_by_struct_with_count", |b| { + b.iter(|| { + query( + &ctx, + &rt, + "select s['id'], count(*) from t group by s['id']", + ) + }) + }); + + c.bench_function("group_by_multiple_with_count", |b| { + b.iter(|| { + query( + &ctx, + &rt, + "select id, s['id'], count(*) from t group by id, s['id']", + ) + }) + }); + + // Additional queries + c.bench_function("order_by_struct_limit", |b| { + b.iter(|| { + query( + &ctx, + &rt, + "select id, s['id'] from t order by s['id'] limit 1000", + ) + }) + }); + + c.bench_function("distinct_struct_field", |b| { + b.iter(|| query(&ctx, &rt, "select distinct s['id'] from t")) + }); + + // Temporary file must outlive the benchmarks, it is deleted when dropped + drop(temp_file); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/core/benches/physical_plan.rs b/datafusion/core/benches/physical_plan.rs index e4838572f60fb..7b66996b05929 100644 --- a/datafusion/core/benches/physical_plan.rs +++ b/datafusion/core/benches/physical_plan.rs @@ -15,11 +15,7 @@ // specific language governing permissions and limitations // under the License. -#[macro_use] -extern crate criterion; -use criterion::{BatchSize, Criterion}; -extern crate arrow; -extern crate datafusion; +use criterion::{BatchSize, Criterion, criterion_group, criterion_main}; use std::sync::Arc; @@ -32,7 +28,7 @@ use tokio::runtime::Runtime; use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use datafusion::physical_plan::{ collect, - expressions::{col, PhysicalSortExpr}, + expressions::{PhysicalSortExpr, col}, }; use datafusion::prelude::SessionContext; use datafusion_datasource::memory::MemorySourceConfig; @@ -40,6 +36,7 @@ use datafusion_physical_expr_common::sort_expr::LexOrdering; // Initialize the operator using the provided record batches and the sort key // as inputs. All record batches must have the same schema. +#[expect(clippy::needless_pass_by_value)] fn sort_preserving_merge_operator( session_ctx: Arc, rt: &Runtime, diff --git a/datafusion/core/benches/preserve_file_partitioning.rs b/datafusion/core/benches/preserve_file_partitioning.rs new file mode 100644 index 0000000000000..9b1f59adc6823 --- /dev/null +++ b/datafusion/core/benches/preserve_file_partitioning.rs @@ -0,0 +1,838 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmark for `preserve_file_partitions` optimization. +//! +//! When enabled, this optimization declares Hive-partitioned tables as +//! `Hash([partition_col])` partitioned, allowing the query optimizer to +//! skip unnecessary repartitioning and sorting operations. 
+//!
+//! When This Optimization Helps
+//! - Window functions: PARTITION BY on the partition column eliminates RepartitionExec and SortExec
+//! - Aggregates with ORDER BY: GROUP BY on the partition column plus ORDER BY eliminates the post-aggregate sort
+//!
+//! When This Optimization Does NOT Help
+//! - GROUP BY non-partition columns: the required Hash distribution doesn't match the declared partitioning
+//! - When the number of distinct file partition groups < the number of available CPUs: the reduced
+//!   parallelism may outweigh the benefit of fewer shuffles
+//!
+//! Usage
+//! - BENCH_SIZE=small|medium|large cargo bench -p datafusion --bench preserve_file_partitioning
+//! - SAVE_PLANS=1 cargo bench ... # Save query plans to files
+
+use arrow::array::{ArrayRef, Float64Array, StringArray, TimestampMillisecondArray};
+use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
+use arrow::record_batch::RecordBatch;
+use arrow::util::pretty::pretty_format_batches;
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion::prelude::{ParquetReadOptions, SessionConfig, SessionContext, col};
+use datafusion_expr::SortExpr;
+use parquet::arrow::ArrowWriter;
+use parquet::file::properties::WriterProperties;
+use std::fs::{self, File};
+use std::io::Write;
+use std::path::Path;
+use std::sync::Arc;
+use tempfile::TempDir;
+use tokio::runtime::Runtime;
+
+#[derive(Debug, Clone, Copy)]
+struct BenchConfig {
+    fact_partitions: usize,
+    rows_per_partition: usize,
+    target_partitions: usize,
+    measurement_time_secs: u64,
+}
+
+impl BenchConfig {
+    fn small() -> Self {
+        Self {
+            fact_partitions: 10,
+            rows_per_partition: 1_000_000,
+            target_partitions: 10,
+            measurement_time_secs: 15,
+        }
+    }
+
+    fn medium() -> Self {
+        Self {
+            fact_partitions: 30,
+            rows_per_partition: 3_000_000,
+            target_partitions: 30,
+            measurement_time_secs: 30,
+        }
+    }
+
+    fn large() -> Self {
+        Self {
+            fact_partitions: 50,
+            rows_per_partition: 6_000_000,
+            target_partitions: 50,
+            measurement_time_secs: 90,
+        }
+    }
+
+    fn from_env() -> Self {
+        match std::env::var("BENCH_SIZE").as_deref() {
+            Ok("small") | Ok("SMALL") => Self::small(),
+            Ok("medium") | Ok("MEDIUM") => Self::medium(),
+            Ok("large") | Ok("LARGE") => Self::large(),
+            _ => {
+                println!("Using SMALL dataset (set BENCH_SIZE=small|medium|large)");
+                Self::small()
+            }
+        }
+    }
+
+    fn total_rows(&self) -> usize {
+        self.fact_partitions * self.rows_per_partition
+    }
+
+    fn high_cardinality(base: &Self) -> Self {
+        Self {
+            fact_partitions: (base.fact_partitions as f64 * 2.5) as usize,
+            rows_per_partition: base.rows_per_partition / 2,
+            target_partitions: base.target_partitions,
+            measurement_time_secs: base.measurement_time_secs,
+        }
+    }
+}
+
+fn dkey_names(count: usize) -> Vec<String> {
+    (0..count)
+        .map(|i| {
+            if i < 26 {
+                ((b'A' + i as u8) as char).to_string()
+            } else {
+                format!(
+                    "{}{}",
+                    (b'A' + ((i / 26) - 1) as u8) as char,
+                    (b'A' + (i % 26) as u8) as char
+                )
+            }
+        })
+        .collect()
+}
+
+/// Hive-partitioned fact table, sorted by timestamp within each partition.
+fn generate_fact_table(
+    base_dir: &Path,
+    num_partitions: usize,
+    rows_per_partition: usize,
+) {
+    let fact_dir = base_dir.join("fact");
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new(
+            "timestamp",
+            DataType::Timestamp(TimeUnit::Millisecond, None),
+            false,
+        ),
+        Field::new("value", DataType::Float64, false),
+    ]));
+
+    let props = WriterProperties::builder()
+        .set_compression(parquet::basic::Compression::SNAPPY)
+        .build();
+
+    let dkeys = dkey_names(num_partitions);
+
+    for dkey in &dkeys {
+        let part_dir = fact_dir.join(format!("f_dkey={dkey}"));
+        fs::create_dir_all(&part_dir).unwrap();
+        let file_path = part_dir.join("data.parquet");
+        let file = File::create(file_path).unwrap();
+
+        let mut writer =
+            ArrowWriter::try_new(file, schema.clone(), Some(props.clone())).unwrap();
+
+        let base_ts = 1672567200000i64; // 2023-01-01T09:00:00
+        let timestamps: Vec<i64> = (0..rows_per_partition)
+            .map(|i| base_ts + (i as i64 * 10000))
+            .collect();
+
+        let values: Vec<f64> = (0..rows_per_partition)
+            .map(|i| 50.0 + (i % 100) as f64 + ((i % 7) as f64 * 10.0))
+            .collect();
+
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![
+                Arc::new(TimestampMillisecondArray::from(timestamps)) as ArrayRef,
+                Arc::new(Float64Array::from(values)),
+            ],
+        )
+        .unwrap();
+
+        writer.write(&batch).unwrap();
+        writer.close().unwrap();
+    }
+}
+
+/// Single-file dimension table for CollectLeft joins.
+fn generate_dimension_table(base_dir: &Path, num_partitions: usize) {
+    let dim_dir = base_dir.join("dimension");
+    fs::create_dir_all(&dim_dir).unwrap();
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("d_dkey", DataType::Utf8, false),
+        Field::new("env", DataType::Utf8, false),
+        Field::new("service", DataType::Utf8, false),
+        Field::new("host", DataType::Utf8, false),
+    ]));
+
+    let props = WriterProperties::builder()
+        .set_compression(parquet::basic::Compression::SNAPPY)
+        .build();
+
+    let file_path = dim_dir.join("data.parquet");
+    let file = File::create(file_path).unwrap();
+    let mut writer = ArrowWriter::try_new(file, schema.clone(), Some(props)).unwrap();
+
+    let dkeys = dkey_names(num_partitions);
+    let envs = ["dev", "prod", "staging", "test"];
+    let services = ["log", "trace", "metric"];
+    let hosts = ["ma", "vim", "nano", "emacs"];
+
+    let d_dkey_vals: Vec<String> = dkeys.clone();
+    let env_vals: Vec<String> = dkeys
+        .iter()
+        .enumerate()
+        .map(|(i, _)| envs[i % envs.len()].to_string())
+        .collect();
+    let service_vals: Vec<String> = dkeys
+        .iter()
+        .enumerate()
+        .map(|(i, _)| services[i % services.len()].to_string())
+        .collect();
+    let host_vals: Vec<String> = dkeys
+        .iter()
+        .enumerate()
+        .map(|(i, _)| hosts[i % hosts.len()].to_string())
+        .collect();
+
+    let batch = RecordBatch::try_new(
+        schema.clone(),
+        vec![
+            Arc::new(StringArray::from(d_dkey_vals)) as ArrayRef,
+            Arc::new(StringArray::from(env_vals)),
+            Arc::new(StringArray::from(service_vals)),
+            Arc::new(StringArray::from(host_vals)),
+        ],
+    )
+    .unwrap();
+
+    writer.write(&batch).unwrap();
+    writer.close().unwrap();
+}
+
+struct BenchVariant {
+    name: &'static str,
+    preserve_file_partitions: usize,
+    prefer_existing_sort: bool,
+}
+
+const BENCH_VARIANTS: [BenchVariant; 3] = [
+    BenchVariant {
+        name: "with_optimization",
+        preserve_file_partitions: 1,
+        prefer_existing_sort: false,
+    },
+    BenchVariant {
+        name: "prefer_existing_sort",
+        preserve_file_partitions: 0,
+        prefer_existing_sort: true,
+    },
+    BenchVariant {
+        name: "without_optimization",
+        preserve_file_partitions: 0,
+        prefer_existing_sort: false,
+    },
+];
+
+async fn
save_plans( + output_file: &Path, + fact_path: &str, + dim_path: Option<&str>, + target_partitions: usize, + query: &str, + file_sort_order: Option>>, +) { + let mut file = File::create(output_file).unwrap(); + writeln!(file, "Query: {query}\n").unwrap(); + + for variant in &BENCH_VARIANTS { + let session_config = SessionConfig::new() + .with_target_partitions(target_partitions) + .set_usize( + "datafusion.optimizer.preserve_file_partitions", + variant.preserve_file_partitions, + ) + .set_bool( + "datafusion.optimizer.prefer_existing_sort", + variant.prefer_existing_sort, + ); + let ctx = SessionContext::new_with_config(session_config); + + let mut fact_options = ParquetReadOptions { + table_partition_cols: vec![("f_dkey".to_string(), DataType::Utf8)], + ..Default::default() + }; + if let Some(ref order) = file_sort_order { + fact_options.file_sort_order = order.clone(); + } + ctx.register_parquet("fact", fact_path, fact_options) + .await + .unwrap(); + + if let Some(dim) = dim_path { + let dim_schema = Arc::new(Schema::new(vec![ + Field::new("d_dkey", DataType::Utf8, false), + Field::new("env", DataType::Utf8, false), + Field::new("service", DataType::Utf8, false), + Field::new("host", DataType::Utf8, false), + ])); + let dim_options = ParquetReadOptions { + schema: Some(&dim_schema), + ..Default::default() + }; + ctx.register_parquet("dimension", dim, dim_options) + .await + .unwrap(); + } + + let df = ctx.sql(query).await.unwrap(); + let plan = df.explain(false, false).unwrap().collect().await.unwrap(); + writeln!(file, "=== {} ===", variant.name).unwrap(); + writeln!(file, "{}\n", pretty_format_batches(&plan).unwrap()).unwrap(); + } +} + +#[expect(clippy::too_many_arguments)] +fn run_benchmark( + c: &mut Criterion, + rt: &Runtime, + name: &str, + fact_path: &str, + dim_path: Option<&str>, + target_partitions: usize, + query: &str, + file_sort_order: &Option>>, +) { + if std::env::var("SAVE_PLANS").is_ok() { + let output_path = format!("{name}_plans.txt"); + rt.block_on(save_plans( + Path::new(&output_path), + fact_path, + dim_path, + target_partitions, + query, + file_sort_order.clone(), + )); + println!("Plans saved to {output_path}"); + } + + let mut group = c.benchmark_group(name); + + for variant in &BENCH_VARIANTS { + let fact_path_owned = fact_path.to_string(); + let dim_path_owned = dim_path.map(|s| s.to_string()); + let sort_order = file_sort_order.clone(); + let query_owned = query.to_string(); + let preserve_file_partitions = variant.preserve_file_partitions; + let prefer_existing_sort = variant.prefer_existing_sort; + + group.bench_function(variant.name, |b| { + b.to_async(rt).iter(|| { + let fact_path = fact_path_owned.clone(); + let dim_path = dim_path_owned.clone(); + let sort_order = sort_order.clone(); + let query = query_owned.clone(); + async move { + let session_config = SessionConfig::new() + .with_target_partitions(target_partitions) + .set_usize( + "datafusion.optimizer.preserve_file_partitions", + preserve_file_partitions, + ) + .set_bool( + "datafusion.optimizer.prefer_existing_sort", + prefer_existing_sort, + ); + let ctx = SessionContext::new_with_config(session_config); + + let mut fact_options = ParquetReadOptions { + table_partition_cols: vec![( + "f_dkey".to_string(), + DataType::Utf8, + )], + ..Default::default() + }; + if let Some(ref order) = sort_order { + fact_options.file_sort_order = order.clone(); + } + ctx.register_parquet("fact", &fact_path, fact_options) + .await + .unwrap(); + + if let Some(ref dim) = dim_path { + let dim_schema = 
Arc::new(Schema::new(vec![ + Field::new("d_dkey", DataType::Utf8, false), + Field::new("env", DataType::Utf8, false), + Field::new("service", DataType::Utf8, false), + Field::new("host", DataType::Utf8, false), + ])); + let dim_options = ParquetReadOptions { + schema: Some(&dim_schema), + ..Default::default() + }; + ctx.register_parquet("dimension", dim, dim_options) + .await + .unwrap(); + } + + let df = ctx.sql(&query).await.unwrap(); + df.collect().await.unwrap() + } + }) + }); + } + + group.finish(); +} + +/// Aggregate on high-cardinality partitions which eliminates repartition and sort. +/// +/// Query: SELECT f_dkey, COUNT(*), SUM(value) FROM fact GROUP BY f_dkey ORDER BY f_dkey +/// +/// ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +/// │ with_optimization │ +/// │ (preserve_file_partitions=enabled) │ +/// │ │ +/// │ ┌───────────────────────────┐ │ +/// │ │ SortPreservingMergeExec │ Sort Preserved │ +/// │ └─────────────┬─────────────┘ │ +/// │ │ │ +/// │ ┌─────────────▼─────────────┐ │ +/// │ │ AggregateExec │ No repartitioning needed │ +/// │ │ (SinglePartitioned) │ │ +/// │ └─────────────┬─────────────┘ │ +/// │ │ │ +/// │ ┌─────────────▼─────────────┐ │ +/// │ │ DataSourceExec │ partitioning=Hash([f_dkey]) │ +/// │ │ file_groups={N groups} │ │ +/// │ └───────────────────────────┘ │ +/// └─────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +/// +/// ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +/// │ prefer_existing_sort │ +/// │ (preserve_file_partitions=disabled, prefer_existing_sort=true) │ +/// │ │ +/// │ ┌───────────────────────────┐ │ +/// │ │ SortPreservingMergeExec │ Sort Preserved │ +/// │ └─────────────┬─────────────┘ │ +/// │ │ │ +/// │ ┌─────────────▼─────────────┐ │ +/// │ │ AggregateExec │ │ +/// │ │ (FinalPartitioned) │ │ +/// │ └─────────────┬─────────────┘ │ +/// │ │ │ +/// │ ┌─────────────▼─────────────┐ │ +/// │ │ RepartitionExec │ Hash shuffle with order preservation │ +/// │ │ Hash([f_dkey], N) │ Uses k-way merge to maintain sort, has overhead │ +/// │ │ preserve_order=true │ │ +/// │ └─────────────┬─────────────┘ │ +/// │ │ │ +/// │ ┌─────────────▼─────────────┐ │ +/// │ │ AggregateExec │ │ +/// │ │ (Partial) │ │ +/// │ └─────────────┬─────────────┘ │ +/// │ │ │ +/// │ ┌─────────────▼─────────────┐ │ +/// │ │ DataSourceExec │ partitioning=UnknownPartitioning │ +/// │ └───────────────────────────┘ │ +/// └─────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +/// +/// ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +/// │ without_optimization │ +/// │ (preserve_file_partitions=disabled, prefer_existing_sort=false) │ +/// │ │ +/// │ ┌───────────────────────────┐ │ +/// │ │ SortPreservingMergeExec │ │ +/// │ └─────────────┬─────────────┘ │ +/// │ │ │ +/// │ ┌─────────────▼─────────────┐ │ +/// │ │ AggregateExec │ FinalPartitioned │ +/// │ │ (FinalPartitioned) │ │ +/// │ └─────────────┬─────────────┘ │ +/// │ │ │ +/// │ ┌─────────────▼─────────────┐ │ +/// │ │ SortExec │ Must sort after shuffle │ +/// │ │ [f_dkey ASC] │ │ +/// │ └─────────────┬─────────────┘ │ +/// │ │ │ +/// │ ┌─────────────▼─────────────┐ │ +/// │ │ RepartitionExec │ Hash shuffle destroys ordering │ +/// │ │ Hash([f_dkey], N) │ │ +/// │ └─────────────┬─────────────┘ │ +/// │ │ │ +/// │ 
┌─────────────▼─────────────┐ │ +/// │ │ AggregateExec │ │ +/// │ │ (Partial) │ │ +/// │ └─────────────┬─────────────┘ │ +/// │ │ │ +/// │ ┌─────────────▼─────────────┐ │ +/// │ │ DataSourceExec │ partitioning=UnknownPartitioning │ +/// │ └───────────────────────────┘ │ +/// └─────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +fn preserve_order_bench( + c: &mut Criterion, + rt: &Runtime, + hc_fact_path: &str, + target_partitions: usize, +) { + let query = "SELECT f_dkey, COUNT(*) as cnt, SUM(value) as total \ + FROM fact \ + GROUP BY f_dkey \ + ORDER BY f_dkey"; + + let file_sort_order = vec![vec![col("f_dkey").sort(true, false)]]; + + run_benchmark( + c, + rt, + "preserve_order", + hc_fact_path, + None, + target_partitions, + query, + &Some(file_sort_order), + ); +} + +/// Join and aggregate on partition column which demonstrates propagation through join. +/// +/// Query: SELECT f.f_dkey, MAX(d.env), ... FROM fact f JOIN dimension d ON f.f_dkey = d.d_dkey +/// WHERE d.service = 'log' GROUP BY f.f_dkey ORDER BY f.f_dkey +/// +/// ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +/// │ with_optimization │ +/// │ (preserve_file_partitions=enabled) │ +/// │ │ +/// │ ┌───────────────────────────┐ │ +/// │ │ SortPreservingMergeExec │ │ +/// │ └─────────────┬─────────────┘ │ +/// │ │ │ +/// │ ┌─────────────▼─────────────┐ │ +/// │ │ AggregateExec │ Hash partitioning propagates through join │ +/// │ │ (SinglePartitioned) │ │ +/// │ └─────────────┬─────────────┘ │ +/// │ │ │ +/// │ ┌─────────────▼─────────────┐ │ +/// │ │ HashJoinExec │ Hash partitioning preserved on probe side │ +/// │ │ (CollectLeft) │ │ +/// │ └──────────┬────────────────┘ │ +/// │ │ │ +/// │ ┌──────┴──────┐ │ +/// │ │ │ │ +/// │ ┌───▼───┐ ┌────▼────────────────┐ │ +/// │ │ Dim │ │ DataSourceExec │ partitioning=Hash([f_dkey]), output_ordering=[f_dkey] │ +/// │ │ Table │ │ (fact, N groups) │ │ +/// │ └───────┘ └─────────────────────┘ │ +/// └─────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +/// +/// ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +/// │ prefer_existing_sort │ +/// │ (preserve_file_partitions=disabled, prefer_existing_sort=true) │ +/// │ │ +/// │ ┌───────────────────────────┐ │ +/// │ │ SortPreservingMergeExec │ │ +/// │ └─────────────┬─────────────┘ │ +/// │ │ │ +/// │ ┌─────────────▼─────────────┐ │ +/// │ │ AggregateExec │ │ +/// │ │ (FinalPartitioned) │ │ +/// │ └─────────────┬─────────────┘ │ +/// │ │ │ +/// │ ┌─────────────▼─────────────┐ │ +/// │ │ RepartitionExec │ Hash shuffle with order preservation │ +/// │ │ preserve_order=true │ Uses k-way merge to maintain sort, has overhead │ +/// │ └─────────────┬─────────────┘ │ +/// │ │ │ +/// │ ┌─────────────▼─────────────┐ │ +/// │ │ AggregateExec │ │ +/// │ │ (Partial) │ │ +/// │ └─────────────┬─────────────┘ │ +/// │ │ │ +/// │ ┌─────────────▼─────────────┐ │ +/// │ │ HashJoinExec │ │ +/// │ │ (CollectLeft) │ │ +/// │ └──────────┬────────────────┘ │ +/// │ │ │ +/// │ ┌──────┴──────┐ │ +/// │ │ │ │ +/// │ ┌───▼───┐ ┌────▼────────────────┐ │ +/// │ │ Dim │ │ DataSourceExec │ partitioning=UnknownPartitioning, output_ordering=[f_dkey] │ +/// │ │ Table │ │ (fact) │ │ +/// │ └───────┘ └─────────────────────┘ │ +/// └─────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +/// +/// 
┌─────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +/// │ without_optimization │ +/// │ (preserve_file_partitions=disabled, prefer_existing_sort=false) │ +/// │ │ +/// │ ┌───────────────────────────┐ │ +/// │ │ SortPreservingMergeExec │ │ +/// │ └─────────────┬─────────────┘ │ +/// │ │ │ +/// │ ┌─────────────▼─────────────┐ │ +/// │ │ AggregateExec │ │ +/// │ │ (FinalPartitioned) │ │ +/// │ └─────────────┬─────────────┘ │ +/// │ │ │ +/// │ ┌─────────────▼─────────────┐ │ +/// │ │ SortExec │ Must sort after shuffle │ +/// │ └─────────────┬─────────────┘ │ +/// │ │ │ +/// │ ┌─────────────▼─────────────┐ │ +/// │ │ RepartitionExec │ Hash shuffle destroys ordering │ +/// │ └─────────────┬─────────────┘ │ +/// │ │ │ +/// │ ┌─────────────▼─────────────┐ │ +/// │ │ AggregateExec │ │ +/// │ │ (Partial) │ │ +/// │ └─────────────┬─────────────┘ │ +/// │ │ │ +/// │ ┌─────────────▼─────────────┐ │ +/// │ │ HashJoinExec │ │ +/// │ │ (CollectLeft) │ │ +/// │ └──────────┬────────────────┘ │ +/// │ │ │ +/// │ ┌──────┴──────┐ │ +/// │ │ │ │ +/// │ ┌───▼───┐ ┌────▼────────────────┐ │ +/// │ │ Dim │ │ DataSourceExec │ partitioning=UnknownPartitioning, output_ordering=[f_dkey] │ +/// │ │ Table │ │ (fact) │ │ +/// │ └───────┘ └─────────────────────┘ │ +/// └─────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +fn preserve_order_join_bench( + c: &mut Criterion, + rt: &Runtime, + hc_fact_path: &str, + dim_path: &str, + target_partitions: usize, +) { + let query = "SELECT f.f_dkey, MAX(d.env), MAX(d.service), COUNT(*), SUM(f.value) \ + FROM fact f \ + INNER JOIN dimension d ON f.f_dkey = d.d_dkey \ + WHERE d.service = 'log' \ + GROUP BY f.f_dkey \ + ORDER BY f.f_dkey"; + + let file_sort_order = vec![vec![col("f_dkey").sort(true, false)]]; + + run_benchmark( + c, + rt, + "preserve_order_join", + hc_fact_path, + Some(dim_path), + target_partitions, + query, + &Some(file_sort_order), + ); +} + +/// Window function with LIMIT which demonstrates partition and sort elimination. 
+///
+/// Query: SELECT f_dkey, timestamp, value,
+/// ROW_NUMBER() OVER (PARTITION BY f_dkey ORDER BY timestamp) as rn
+/// FROM fact LIMIT 1000
+///
+/// ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────┐
+/// │ with_optimization │
+/// │ (preserve_file_partitions=enabled) │
+/// │ │
+/// │ ┌───────────────────────────┐ │
+/// │ │ GlobalLimitExec │ │
+/// │ │ (LIMIT 1000) │ │
+/// │ └─────────────┬─────────────┘ │
+/// │ │ │
+/// │ ┌─────────────▼─────────────┐ │
+/// │ │ BoundedWindowAggExec │ No repartition needed │
+/// │ │ PARTITION BY f_dkey │ │
+/// │ │ ORDER BY timestamp │ │
+/// │ └─────────────┬─────────────┘ │
+/// │ │ │
+/// │ ┌─────────────▼─────────────┐ │
+/// │ │ DataSourceExec │ partitioning=Hash([f_dkey]), output_ordering=[f_dkey, timestamp] │
+/// │ │ file_groups={N groups} │ │
+/// │ └───────────────────────────┘ │
+/// └─────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+///
+/// ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────┐
+/// │ prefer_existing_sort │
+/// │ (preserve_file_partitions=disabled, prefer_existing_sort=true) │
+/// │ │
+/// │ ┌───────────────────────────┐ │
+/// │ │ GlobalLimitExec │ │
+/// │ └─────────────┬─────────────┘ │
+/// │ │ │
+/// │ ┌─────────────▼─────────────┐ │
+/// │ │ BoundedWindowAggExec │ │
+/// │ └─────────────┬─────────────┘ │
+/// │ │ │
+/// │ ┌─────────────▼─────────────┐ │
+/// │ │ RepartitionExec │ Hash shuffle with order preservation │
+/// │ │ Hash([f_dkey], N) │ Uses k-way merge to maintain sort, has overhead │
+/// │ │ preserve_order=true │ │
+/// │ └─────────────┬─────────────┘ │
+/// │ │ │
+/// │ ┌─────────────▼─────────────┐ │
+/// │ │ DataSourceExec │ partitioning=UnknownPartitioning, output_ordering=[f_dkey, timestamp] │
+/// │ └───────────────────────────┘ │
+/// └─────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+///
+/// ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────┐
+/// │ without_optimization │
+/// │ (preserve_file_partitions=disabled, prefer_existing_sort=false) │
+/// │ │
+/// │ ┌───────────────────────────┐ │
+/// │ │ GlobalLimitExec │ │
+/// │ └─────────────┬─────────────┘ │
+/// │ │ │
+/// │ ┌─────────────▼─────────────┐ │
+/// │ │ BoundedWindowAggExec │ │
+/// │ └─────────────┬─────────────┘ │
+/// │ │ │
+/// │ ┌─────────────▼─────────────┐ │
+/// │ │ SortExec │ Must sort after shuffle │
+/// │ │ [f_dkey, timestamp] │ │
+/// │ └─────────────┬─────────────┘ │
+/// │ │ │
+/// │ ┌─────────────▼─────────────┐ │
+/// │ │ RepartitionExec │ Hash shuffle destroys ordering │
+/// │ │ Hash([f_dkey], N) │ │
+/// │ └─────────────┬─────────────┘ │
+/// │ │ │
+/// │ ┌─────────────▼─────────────┐ │
+/// │ │ DataSourceExec │ partitioning=UnknownPartitioning, output_ordering=[f_dkey, timestamp] │
+/// │ └───────────────────────────┘ │
+/// └─────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+fn preserve_order_window_bench(
+    c: &mut Criterion,
+    rt: &Runtime,
+    fact_path: &str,
+    target_partitions: usize,
+) {
+    let query = "SELECT f_dkey, timestamp, value, \
+        ROW_NUMBER() OVER (PARTITION BY f_dkey ORDER BY timestamp) as rn \
+        FROM fact \
+        LIMIT 1000";
+
+    let file_sort_order = vec![vec![
+        col("f_dkey").sort(true, false),
+        col("timestamp").sort(true, false),
+    ]];
+
+    
run_benchmark( + c, + rt, + "preserve_order_window", + fact_path, + None, + target_partitions, + query, + &Some(file_sort_order), + ); +} + +fn benchmark_main(c: &mut Criterion) { + let config = BenchConfig::from_env(); + let hc_config = BenchConfig::high_cardinality(&config); + + println!("\n=== Preserve File Partitioning Benchmark ==="); + println!( + "Normal config: {} partitions × {} rows = {} total rows", + config.fact_partitions, + config.rows_per_partition, + config.total_rows() + ); + println!( + "High-cardinality config: {} partitions × {} rows = {} total rows", + hc_config.fact_partitions, + hc_config.rows_per_partition, + hc_config.total_rows() + ); + println!("Target partitions: {}\n", config.target_partitions); + + let tmp_dir = TempDir::new().unwrap(); + println!("Generating data..."); + + // High-cardinality fact table + generate_fact_table( + tmp_dir.path(), + hc_config.fact_partitions, + hc_config.rows_per_partition, + ); + let hc_fact_dir = tmp_dir.path().join("fact_hc"); + fs::rename(tmp_dir.path().join("fact"), &hc_fact_dir).unwrap(); + let hc_fact_path = hc_fact_dir.to_str().unwrap().to_string(); + + // Normal fact table + generate_fact_table( + tmp_dir.path(), + config.fact_partitions, + config.rows_per_partition, + ); + let fact_path = tmp_dir.path().join("fact").to_str().unwrap().to_string(); + + // Dimension table (for join) + generate_dimension_table(tmp_dir.path(), hc_config.fact_partitions); + let dim_path = tmp_dir + .path() + .join("dimension") + .to_str() + .unwrap() + .to_string(); + + println!("Done.\n"); + + let rt = Runtime::new().unwrap(); + + preserve_order_bench(c, &rt, &hc_fact_path, hc_config.target_partitions); + preserve_order_join_bench( + c, + &rt, + &hc_fact_path, + &dim_path, + hc_config.target_partitions, + ); + preserve_order_window_bench(c, &rt, &fact_path, config.target_partitions); +} + +criterion_group! 
{ + name = benches; + config = { + let config = BenchConfig::from_env(); + Criterion::default() + .measurement_time(std::time::Duration::from_secs(config.measurement_time_secs)) + .sample_size(10) + }; + targets = benchmark_main +} +criterion_main!(benches); diff --git a/datafusion/core/benches/push_down_filter.rs b/datafusion/core/benches/push_down_filter.rs index 139fb12c30947..d41085907dbc8 100644 --- a/datafusion/core/benches/push_down_filter.rs +++ b/datafusion/core/benches/push_down_filter.rs @@ -18,16 +18,16 @@ use arrow::array::RecordBatch; use arrow::datatypes::{DataType, Field, Schema}; use bytes::{BufMut, BytesMut}; -use criterion::{criterion_group, criterion_main, Criterion}; +use criterion::{Criterion, criterion_group, criterion_main}; use datafusion::config::ConfigOptions; use datafusion::prelude::{ParquetReadOptions, SessionContext}; use datafusion_execution::object_store::ObjectStoreUrl; -use datafusion_physical_optimizer::filter_pushdown::FilterPushdown; use datafusion_physical_optimizer::PhysicalOptimizerRule; +use datafusion_physical_optimizer::filter_pushdown::FilterPushdown; use datafusion_physical_plan::ExecutionPlan; use object_store::memory::InMemory; use object_store::path::Path; -use object_store::ObjectStore; +use object_store::{ObjectStore, ObjectStoreExt}; use parquet::arrow::ArrowWriter; use std::sync::Arc; diff --git a/datafusion/core/benches/range_and_generate_series.rs b/datafusion/core/benches/range_and_generate_series.rs new file mode 100644 index 0000000000000..10d560df0813e --- /dev/null +++ b/datafusion/core/benches/range_and_generate_series.rs @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+mod data_utils;
+
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion::execution::context::SessionContext;
+use parking_lot::Mutex;
+use std::hint::black_box;
+use std::sync::Arc;
+use tokio::runtime::Runtime;
+
+#[expect(clippy::needless_pass_by_value)]
+fn query(ctx: Arc<Mutex<SessionContext>>, rt: &Runtime, sql: &str) {
+    let df = rt.block_on(ctx.lock().sql(sql)).unwrap();
+    black_box(rt.block_on(df.collect()).unwrap());
+}
+
+fn create_context() -> Arc<Mutex<SessionContext>> {
+    let ctx = SessionContext::new();
+    Arc::new(Mutex::new(ctx))
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let ctx = create_context();
+    let rt = Runtime::new().unwrap();
+
+    c.bench_function("range(1000000)", |b| {
+        b.iter(|| query(ctx.clone(), &rt, "SELECT value from range(1000000)"))
+    });
+
+    c.bench_function("generate_series(1000000)", |b| {
+        b.iter(|| {
+            query(
+                ctx.clone(),
+                &rt,
+                "SELECT value from generate_series(1000000)",
+            )
+        })
+    });
+
+    c.bench_function("range(0, 1000000, 5)", |b| {
+        b.iter(|| query(ctx.clone(), &rt, "SELECT value from range(0, 1000000, 5)"))
+    });
+
+    c.bench_function("generate_series(0, 1000000, 5)", |b| {
+        b.iter(|| {
+            query(
+                ctx.clone(),
+                &rt,
+                "SELECT value from generate_series(0, 1000000, 5)",
+            )
+        })
+    });
+
+    c.bench_function("range(1000000, 0, -5)", |b| {
+        b.iter(|| query(ctx.clone(), &rt, "SELECT value from range(1000000, 0, -5)"))
+    });
+
+    c.bench_function("generate_series(1000000, 0, -5)", |b| {
+        b.iter(|| {
+            query(
+                ctx.clone(),
+                &rt,
+                "SELECT value from generate_series(1000000, 0, -5)",
+            )
+        })
+    });
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/core/benches/reset_plan_states.rs b/datafusion/core/benches/reset_plan_states.rs
new file mode 100644
index 0000000000000..5afae7f43242d
--- /dev/null
+++ b/datafusion/core/benches/reset_plan_states.rs
@@ -0,0 +1,200 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::{Arc, LazyLock};
+
+use arrow_schema::{DataType, Field, Fields, Schema, SchemaRef};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion::prelude::SessionContext;
+use datafusion_catalog::MemTable;
+use datafusion_physical_plan::ExecutionPlan;
+use datafusion_physical_plan::displayable;
+use datafusion_physical_plan::execution_plan::reset_plan_states;
+use tokio::runtime::Runtime;
+
+const NUM_FIELDS: usize = 1000;
+const PREDICATE_LEN: usize = 50;
+
+static SCHEMA: LazyLock<SchemaRef> = LazyLock::new(|| {
+    Arc::new(Schema::new(
+        (0..NUM_FIELDS)
+            .map(|i| Arc::new(Field::new(format!("x_{i}"), DataType::Int64, false)))
+            .collect::<Fields>(),
+    ))
+});
+
+fn col_name(i: usize) -> String {
+    format!("x_{i}")
+}
+
+fn aggr_name(i: usize) -> String {
+    format!("aggr_{i}")
+}
+
+fn physical_plan(
+    ctx: &SessionContext,
+    rt: &Runtime,
+    sql: &str,
+) -> Arc<dyn ExecutionPlan> {
+    rt.block_on(async {
+        ctx.sql(sql)
+            .await
+            .unwrap()
+            .create_physical_plan()
+            .await
+            .unwrap()
+    })
+}
+
+fn predicate(col_name: impl Fn(usize) -> String, len: usize) -> String {
+    let mut predicate = String::new();
+    for i in 0..len {
+        if i > 0 {
+            predicate.push_str(" AND ");
+        }
+        predicate.push_str(&col_name(i));
+        predicate.push_str(" = ");
+        predicate.push_str(&i.to_string());
+    }
+    predicate
+}
+
+/// Returns a typical plan for the query like:
+///
+/// ```sql
+/// SELECT aggr1(col1) as aggr1, aggr2(col2) as aggr2 FROM t
+/// WHERE p1
+/// HAVING p2
+/// ```
+///
+/// Where `p1` and `p2` are some long predicates.
+///
+fn query1() -> String {
+    let mut query = String::new();
+    query.push_str("SELECT ");
+    for i in 0..NUM_FIELDS {
+        if i > 0 {
+            query.push_str(", ");
+        }
+        query.push_str("AVG(");
+        query.push_str(&col_name(i));
+        query.push_str(") AS ");
+        query.push_str(&aggr_name(i));
+    }
+    query.push_str(" FROM t WHERE ");
+    query.push_str(&predicate(col_name, PREDICATE_LEN));
+    query.push_str(" HAVING ");
+    query.push_str(&predicate(aggr_name, PREDICATE_LEN));
+    query
+}
+
+/// Returns a typical plan for the query like:
+///
+/// ```sql
+/// SELECT projection FROM t JOIN v ON t.a = v.a
+/// WHERE p1
+/// ```
+///
+fn query2() -> String {
+    let mut query = String::new();
+    query.push_str("SELECT ");
+    for i in (0..NUM_FIELDS).step_by(2) {
+        if i > 0 {
+            query.push_str(", ");
+        }
+        if (i / 2) % 2 == 0 {
+            query.push_str(&format!("t.{}", col_name(i)));
+        } else {
+            query.push_str(&format!("v.{}", col_name(i)));
+        }
+    }
+    query.push_str(" FROM t JOIN v ON t.x_0 = v.x_0 WHERE ");
+
+    fn qualified_name(i: usize) -> String {
+        format!("t.{}", col_name(i))
+    }
+
+    query.push_str(&predicate(qualified_name, PREDICATE_LEN));
+    query
+}
+
+/// Returns a typical plan for the query like:
+///
+/// ```sql
+/// SELECT projection FROM t
+/// WHERE p
+/// ```
+///
+fn query3() -> String {
+    let mut query = String::new();
+    query.push_str("SELECT ");
+
+    // Create non-trivial projection.
+    for i in 0..NUM_FIELDS / 2 {
+        if i > 0 {
+            query.push_str(", ");
+        }
+        query.push_str(&col_name(i * 2));
+        query.push_str(" + ");
+        query.push_str(&col_name(i * 2 + 1));
+    }
+
+    query.push_str(" FROM t WHERE ");
+    query.push_str(&predicate(col_name, PREDICATE_LEN));
+    query
+}
+
+fn run_reset_states(b: &mut criterion::Bencher, plan: &Arc<dyn ExecutionPlan>) {
+    b.iter(|| std::hint::black_box(reset_plan_states(Arc::clone(plan)).unwrap()));
+}
+
+/// Measures the overhead of the steps required to make an independent instance
+/// of an execution plan so that it can be re-executed without repeating the
+/// planning stage.
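+///
+/// A minimal sketch of the pattern being measured (using this file's own
+/// helpers; error handling elided):
+///
+/// ```ignore
+/// let plan = physical_plan(&ctx, &rt, &query1());
+/// // Each call yields an independent plan instance with fresh execution state:
+/// let fresh = reset_plan_states(Arc::clone(&plan)).unwrap();
+/// ```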
+fn bench_reset_plan_states(c: &mut Criterion) {
+    env_logger::init();
+
+    let rt = Runtime::new().unwrap();
+    let ctx = SessionContext::new();
+    ctx.register_table(
+        "t",
+        Arc::new(MemTable::try_new(Arc::clone(&SCHEMA), vec![vec![], vec![]]).unwrap()),
+    )
+    .unwrap();
+
+    ctx.register_table(
+        "v",
+        Arc::new(MemTable::try_new(Arc::clone(&SCHEMA), vec![vec![], vec![]]).unwrap()),
+    )
+    .unwrap();
+
+    macro_rules! bench_query {
+        ($query_producer: expr) => {{
+            let sql = $query_producer();
+            let plan = physical_plan(&ctx, &rt, &sql);
+            log::debug!("plan:\n{}", displayable(plan.as_ref()).indent(true));
+            move |b| run_reset_states(b, &plan)
+        }};
+    }
+
+    c.bench_function("query1", bench_query!(query1));
+    c.bench_function("query2", bench_query!(query2));
+    c.bench_function("query3", bench_query!(query3));
+}
+
+criterion_group!(benches, bench_reset_plan_states);
+criterion_main!(benches);
diff --git a/datafusion/core/benches/scalar.rs b/datafusion/core/benches/scalar.rs
index 540f7212e96e9..d06ed3f28b743 100644
--- a/datafusion/core/benches/scalar.rs
+++ b/datafusion/core/benches/scalar.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion::scalar::ScalarValue;
 
 fn criterion_benchmark(c: &mut Criterion) {
diff --git a/datafusion/core/benches/sort.rs b/datafusion/core/benches/sort.rs
index 276151e253f7e..7544f7ae26d43 100644
--- a/datafusion/core/benches/sort.rs
+++ b/datafusion/core/benches/sort.rs
@@ -78,18 +78,18 @@ use datafusion::physical_plan::sorts::sort::SortExec;
 use datafusion::{
     execution::context::TaskContext,
     physical_plan::{
+        ExecutionPlan, ExecutionPlanProperties,
         coalesce_partitions::CoalescePartitionsExec,
-        sorts::sort_preserving_merge::SortPreservingMergeExec, ExecutionPlan,
-        ExecutionPlanProperties,
+        sorts::sort_preserving_merge::SortPreservingMergeExec,
     },
     prelude::SessionContext,
 };
 use datafusion_datasource::memory::MemorySourceConfig;
-use datafusion_physical_expr::{expressions::col, PhysicalSortExpr};
+use datafusion_physical_expr::{PhysicalSortExpr, expressions::col};
 use datafusion_physical_expr_common::sort_expr::LexOrdering;
 
 /// Benchmarks for SortPreservingMerge stream
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use futures::StreamExt;
 use rand::rngs::StdRng;
 use rand::{Rng, SeedableRng};
@@ -102,61 +102,104 @@ const NUM_STREAMS: usize = 8;
 
 /// The size of each batch within each stream
 const BATCH_SIZE: usize = 1024;
 
-/// Total number of input rows to generate
-const INPUT_SIZE: u64 = 100000;
+/// Input sizes to benchmark. The small size (100K) exercises the
+/// in-memory concat-and-sort path; the large size (1M) exercises
+/// the sort-then-merge path with high fan-in.
+const INPUT_SIZES: &[(u64, &str)] = &[(100_000, "100k"), (1_000_000, "1M")];
 
 type PartitionedBatches = Vec<Vec<RecordBatch>>;
+type StreamGenerator = Box<dyn Fn(bool) -> PartitionedBatches>;
 
 fn criterion_benchmark(c: &mut Criterion) {
-    let cases: Vec<(&str, &dyn Fn(bool) -> PartitionedBatches)> = vec![
-        ("i64", &i64_streams),
-        ("f64", &f64_streams),
-        ("utf8 low cardinality", &utf8_low_cardinality_streams),
-        ("utf8 high cardinality", &utf8_high_cardinality_streams),
-        (
-            "utf8 view low cardinality",
-            &utf8_view_low_cardinality_streams,
-        ),
-        (
-            "utf8 view high cardinality",
-            &utf8_view_high_cardinality_streams,
-        ),
-        ("utf8 tuple", &utf8_tuple_streams),
-        ("utf8 view tuple", &utf8_view_tuple_streams),
-        ("utf8 dictionary", &dictionary_streams),
-        ("utf8 dictionary tuple", &dictionary_tuple_streams),
-        ("mixed dictionary tuple", &mixed_dictionary_tuple_streams),
-        ("mixed tuple", &mixed_tuple_streams),
-        (
-            "mixed tuple with utf8 view",
-            &mixed_tuple_with_utf8_view_streams,
-        ),
-    ];
-
-    for (name, f) in cases {
-        c.bench_function(&format!("merge sorted {name}"), |b| {
-            let data = f(true);
-            let case = BenchCase::merge_sorted(&data);
-            b.iter(move || case.run())
-        });
-
-        c.bench_function(&format!("sort merge {name}"), |b| {
-            let data = f(false);
-            let case = BenchCase::sort_merge(&data);
-            b.iter(move || case.run())
-        });
-
-        c.bench_function(&format!("sort {name}"), |b| {
-            let data = f(false);
-            let case = BenchCase::sort(&data);
-            b.iter(move || case.run())
-        });
-
-        c.bench_function(&format!("sort partitioned {name}"), |b| {
-            let data = f(false);
-            let case = BenchCase::sort_partitioned(&data);
-            b.iter(move || case.run())
-        });
+    for &(input_size, size_label) in INPUT_SIZES {
+        let cases: Vec<(&str, StreamGenerator)> = vec![
+            (
+                "i64",
+                Box::new(move |sorted| i64_streams(sorted, input_size)),
+            ),
+            (
+                "f64",
+                Box::new(move |sorted| f64_streams(sorted, input_size)),
+            ),
+            (
+                "utf8 low cardinality",
+                Box::new(move |sorted| utf8_low_cardinality_streams(sorted, input_size)),
+            ),
+            (
+                "utf8 high cardinality",
+                Box::new(move |sorted| utf8_high_cardinality_streams(sorted, input_size)),
+            ),
+            (
+                "utf8 view low cardinality",
+                Box::new(move |sorted| {
+                    utf8_view_low_cardinality_streams(sorted, input_size)
+                }),
+            ),
+            (
+                "utf8 view high cardinality",
+                Box::new(move |sorted| {
+                    utf8_view_high_cardinality_streams(sorted, input_size)
+                }),
+            ),
+            (
+                "utf8 tuple",
+                Box::new(move |sorted| utf8_tuple_streams(sorted, input_size)),
+            ),
+            (
+                "utf8 view tuple",
+                Box::new(move |sorted| utf8_view_tuple_streams(sorted, input_size)),
+            ),
+            (
+                "utf8 dictionary",
+                Box::new(move |sorted| dictionary_streams(sorted, input_size)),
+            ),
+            (
+                "utf8 dictionary tuple",
+                Box::new(move |sorted| dictionary_tuple_streams(sorted, input_size)),
+            ),
+            (
+                "mixed dictionary tuple",
+                Box::new(move |sorted| {
+                    mixed_dictionary_tuple_streams(sorted, input_size)
+                }),
+            ),
+            (
+                "mixed tuple",
+                Box::new(move |sorted| mixed_tuple_streams(sorted, input_size)),
+            ),
+            (
+                "mixed tuple with utf8 view",
+                Box::new(move |sorted| {
+                    mixed_tuple_with_utf8_view_streams(sorted, input_size)
+                }),
+            ),
+        ];
+
+        for (name, f) in &cases {
+            c.bench_function(&format!("merge sorted {name} {size_label}"), |b| {
+                let data = f(true);
+                let case = BenchCase::merge_sorted(&data);
+                b.iter(move || case.run())
+            });
+
+            c.bench_function(&format!("sort merge {name} {size_label}"), |b| {
+                let data = f(false);
+                let case = BenchCase::sort_merge(&data);
+                b.iter(move || case.run())
+            });
+
+            c.bench_function(&format!("sort {name} 
{size_label}"), |b| { + let data = f(false); + let case = BenchCase::sort(&data); + b.iter(move || case.run()) + }); + + c.bench_function(&format!("sort partitioned {name} {size_label}"), |b| { + let data = f(false); + let case = BenchCase::sort_partitioned(&data); + b.iter(move || case.run()) + }); + } } } @@ -279,8 +322,8 @@ fn make_sort_exprs(schema: &Schema) -> LexOrdering { } /// Create streams of int64 (where approximately 1/3 values is repeated) -fn i64_streams(sorted: bool) -> PartitionedBatches { - let mut values = DataGenerator::new().i64_values(); +fn i64_streams(sorted: bool, input_size: u64) -> PartitionedBatches { + let mut values = DataGenerator::new(input_size).i64_values(); if sorted { values.sort_unstable(); } @@ -293,8 +336,8 @@ fn i64_streams(sorted: bool) -> PartitionedBatches { /// Create streams of f64 (where approximately 1/3 values are repeated) /// with the same distribution as i64_streams -fn f64_streams(sorted: bool) -> PartitionedBatches { - let mut values = DataGenerator::new().f64_values(); +fn f64_streams(sorted: bool, input_size: u64) -> PartitionedBatches { + let mut values = DataGenerator::new(input_size).f64_values(); if sorted { values.sort_unstable_by(|a, b| a.total_cmp(b)); } @@ -306,8 +349,8 @@ fn f64_streams(sorted: bool) -> PartitionedBatches { } /// Create streams of random low cardinality utf8 values -fn utf8_low_cardinality_streams(sorted: bool) -> PartitionedBatches { - let mut values = DataGenerator::new().utf8_low_cardinality_values(); +fn utf8_low_cardinality_streams(sorted: bool, input_size: u64) -> PartitionedBatches { + let mut values = DataGenerator::new(input_size).utf8_low_cardinality_values(); if sorted { values.sort_unstable(); } @@ -318,8 +361,11 @@ fn utf8_low_cardinality_streams(sorted: bool) -> PartitionedBatches { } /// Create streams of random low cardinality utf8_view values -fn utf8_view_low_cardinality_streams(sorted: bool) -> PartitionedBatches { - let mut values = DataGenerator::new().utf8_low_cardinality_values(); +fn utf8_view_low_cardinality_streams( + sorted: bool, + input_size: u64, +) -> PartitionedBatches { + let mut values = DataGenerator::new(input_size).utf8_low_cardinality_values(); if sorted { values.sort_unstable(); } @@ -330,8 +376,11 @@ fn utf8_view_low_cardinality_streams(sorted: bool) -> PartitionedBatches { } /// Create streams of high cardinality (~ no duplicates) utf8_view values -fn utf8_view_high_cardinality_streams(sorted: bool) -> PartitionedBatches { - let mut values = DataGenerator::new().utf8_high_cardinality_values(); +fn utf8_view_high_cardinality_streams( + sorted: bool, + input_size: u64, +) -> PartitionedBatches { + let mut values = DataGenerator::new(input_size).utf8_high_cardinality_values(); if sorted { values.sort_unstable(); } @@ -342,8 +391,8 @@ fn utf8_view_high_cardinality_streams(sorted: bool) -> PartitionedBatches { } /// Create streams of high cardinality (~ no duplicates) utf8 values -fn utf8_high_cardinality_streams(sorted: bool) -> PartitionedBatches { - let mut values = DataGenerator::new().utf8_high_cardinality_values(); +fn utf8_high_cardinality_streams(sorted: bool, input_size: u64) -> PartitionedBatches { + let mut values = DataGenerator::new(input_size).utf8_high_cardinality_values(); if sorted { values.sort_unstable(); } @@ -354,15 +403,15 @@ fn utf8_high_cardinality_streams(sorted: bool) -> PartitionedBatches { } /// Create a batch of (utf8_low, utf8_low, utf8_high) -fn utf8_tuple_streams(sorted: bool) -> PartitionedBatches { - let mut gen = DataGenerator::new(); +fn 
utf8_tuple_streams(sorted: bool, input_size: u64) -> PartitionedBatches { + let mut data_gen = DataGenerator::new(input_size); // need to sort by the combined key, so combine them together - let mut tuples: Vec<_> = gen + let mut tuples: Vec<_> = data_gen .utf8_low_cardinality_values() .into_iter() - .zip(gen.utf8_low_cardinality_values()) - .zip(gen.utf8_high_cardinality_values()) + .zip(data_gen.utf8_low_cardinality_values()) + .zip(data_gen.utf8_high_cardinality_values()) .collect(); if sorted { @@ -387,15 +436,15 @@ fn utf8_tuple_streams(sorted: bool) -> PartitionedBatches { } /// Create a batch of (utf8_view_low, utf8_view_low, utf8_view_high) -fn utf8_view_tuple_streams(sorted: bool) -> PartitionedBatches { - let mut gen = DataGenerator::new(); +fn utf8_view_tuple_streams(sorted: bool, input_size: u64) -> PartitionedBatches { + let mut data_gen = DataGenerator::new(input_size); // need to sort by the combined key, so combine them together - let mut tuples: Vec<_> = gen + let mut tuples: Vec<_> = data_gen .utf8_low_cardinality_values() .into_iter() - .zip(gen.utf8_low_cardinality_values()) - .zip(gen.utf8_high_cardinality_values()) + .zip(data_gen.utf8_low_cardinality_values()) + .zip(data_gen.utf8_high_cardinality_values()) .collect(); if sorted { @@ -420,16 +469,16 @@ fn utf8_view_tuple_streams(sorted: bool) -> PartitionedBatches { } /// Create a batch of (f64, utf8_low, utf8_low, i64) -fn mixed_tuple_streams(sorted: bool) -> PartitionedBatches { - let mut gen = DataGenerator::new(); +fn mixed_tuple_streams(sorted: bool, input_size: u64) -> PartitionedBatches { + let mut data_gen = DataGenerator::new(input_size); // need to sort by the combined key, so combine them together - let mut tuples: Vec<_> = gen + let mut tuples: Vec<_> = data_gen .i64_values() .into_iter() - .zip(gen.utf8_low_cardinality_values()) - .zip(gen.utf8_low_cardinality_values()) - .zip(gen.i64_values()) + .zip(data_gen.utf8_low_cardinality_values()) + .zip(data_gen.utf8_low_cardinality_values()) + .zip(data_gen.i64_values()) .collect(); if sorted { @@ -458,16 +507,19 @@ fn mixed_tuple_streams(sorted: bool) -> PartitionedBatches { } /// Create a batch of (f64, utf8_view_low, utf8_view_low, i64) -fn mixed_tuple_with_utf8_view_streams(sorted: bool) -> PartitionedBatches { - let mut gen = DataGenerator::new(); +fn mixed_tuple_with_utf8_view_streams( + sorted: bool, + input_size: u64, +) -> PartitionedBatches { + let mut data_gen = DataGenerator::new(input_size); // need to sort by the combined key, so combine them together - let mut tuples: Vec<_> = gen + let mut tuples: Vec<_> = data_gen .i64_values() .into_iter() - .zip(gen.utf8_low_cardinality_values()) - .zip(gen.utf8_low_cardinality_values()) - .zip(gen.i64_values()) + .zip(data_gen.utf8_low_cardinality_values()) + .zip(data_gen.utf8_low_cardinality_values()) + .zip(data_gen.i64_values()) .collect(); if sorted { @@ -496,9 +548,9 @@ fn mixed_tuple_with_utf8_view_streams(sorted: bool) -> PartitionedBatches { } /// Create a batch of (utf8_dict) -fn dictionary_streams(sorted: bool) -> PartitionedBatches { - let mut gen = DataGenerator::new(); - let mut values = gen.utf8_low_cardinality_values(); +fn dictionary_streams(sorted: bool, input_size: u64) -> PartitionedBatches { + let mut data_gen = DataGenerator::new(input_size); + let mut values = data_gen.utf8_low_cardinality_values(); if sorted { values.sort_unstable(); } @@ -511,13 +563,13 @@ fn dictionary_streams(sorted: bool) -> PartitionedBatches { } /// Create a batch of (utf8_dict, utf8_dict, utf8_dict) -fn 
dictionary_tuple_streams(sorted: bool) -> PartitionedBatches {
-    let mut gen = DataGenerator::new();
-    let mut tuples: Vec<_> = gen
+fn dictionary_tuple_streams(sorted: bool, input_size: u64) -> PartitionedBatches {
+    let mut data_gen = DataGenerator::new(input_size);
+    let mut tuples: Vec<_> = data_gen
         .utf8_low_cardinality_values()
         .into_iter()
-        .zip(gen.utf8_low_cardinality_values())
-        .zip(gen.utf8_low_cardinality_values())
+        .zip(data_gen.utf8_low_cardinality_values())
+        .zip(data_gen.utf8_low_cardinality_values())
         .collect();
 
     if sorted {
@@ -542,14 +594,14 @@
 }
 
 /// Create a batch of (utf8_dict, utf8_dict, utf8_dict, i64)
-fn mixed_dictionary_tuple_streams(sorted: bool) -> PartitionedBatches {
-    let mut gen = DataGenerator::new();
-    let mut tuples: Vec<_> = gen
+fn mixed_dictionary_tuple_streams(sorted: bool, input_size: u64) -> PartitionedBatches {
+    let mut data_gen = DataGenerator::new(input_size);
+    let mut tuples: Vec<_> = data_gen
         .utf8_low_cardinality_values()
         .into_iter()
-        .zip(gen.utf8_low_cardinality_values())
-        .zip(gen.utf8_low_cardinality_values())
-        .zip(gen.i64_values())
+        .zip(data_gen.utf8_low_cardinality_values())
+        .zip(data_gen.utf8_low_cardinality_values())
+        .zip(data_gen.i64_values())
         .collect();
 
     if sorted {
@@ -579,19 +631,21 @@
 /// Encapsulates creating data for this test
 struct DataGenerator {
     rng: StdRng,
+    input_size: u64,
 }
 
 impl DataGenerator {
-    fn new() -> Self {
+    fn new(input_size: u64) -> Self {
         Self {
            rng: StdRng::seed_from_u64(42),
+           input_size,
         }
     }
 
     /// Create an array of i64 sorted values (where approximately 1/3 values is repeated)
     fn i64_values(&mut self) -> Vec<i64> {
-        let mut vec: Vec<_> = (0..INPUT_SIZE)
-            .map(|_| self.rng.random_range(0..INPUT_SIZE as i64))
+        let mut vec: Vec<_> = (0..self.input_size)
+            .map(|_| self.rng.random_range(0..self.input_size as i64))
            .collect();
 
        vec.sort_unstable();
@@ -614,7 +668,7 @@ impl DataGenerator {
            .collect::<Vec<_>>();
 
        // pick from the 100 strings randomly
-        let mut input = (0..INPUT_SIZE)
+        let mut input = (0..self.input_size)
            .map(|_| {
                let idx = self.rng.random_range(0..strings.len());
                let s = Arc::clone(&strings[idx]);
@@ -629,7 +683,7 @@ impl DataGenerator {
 
     /// Create sorted values of high cardinality (~ no duplicates) utf8 values
     fn utf8_high_cardinality_values(&mut self) -> Vec<Option<String>> {
        // make random strings
-        let mut input = (0..INPUT_SIZE)
+        let mut input = (0..self.input_size)
            .map(|_| Some(self.random_string()))
            .collect::<Vec<_>>();
 
diff --git a/datafusion/core/benches/sort_limit_query_sql.rs b/datafusion/core/benches/sort_limit_query_sql.rs
index e535a018161f1..54cd9a0bcd547 100644
--- a/datafusion/core/benches/sort_limit_query_sql.rs
+++ b/datafusion/core/benches/sort_limit_query_sql.rs
@@ -15,9 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
-#[macro_use]
-extern crate criterion;
-use criterion::Criterion;
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion::datasource::file_format::csv::CsvFormat;
 use datafusion::datasource::listing::{
     ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl,
@@ -27,9 +25,6 @@ use datafusion::prelude::SessionConfig;
 use parking_lot::Mutex;
 use std::sync::Arc;
 
-extern crate arrow;
-extern crate datafusion;
-
 use arrow::datatypes::{DataType, Field, Schema};
 use datafusion::datasource::MemTable;
@@ -37,6 +32,7 @@ use datafusion::execution::context::SessionContext;
 
 use tokio::runtime::Runtime;
 
+#[expect(clippy::needless_pass_by_value)]
 fn query(ctx: Arc<Mutex<SessionContext>>, rt: &Runtime, sql: &str) {
     // execute the query
     let df = rt.block_on(ctx.lock().sql(sql)).unwrap();
@@ -97,8 +93,7 @@ fn create_context() -> Arc<Mutex<SessionContext>> {
         ctx_holder.lock().push(Arc::new(Mutex::new(ctx)))
     });
 
-    let ctx = ctx_holder.lock().first().unwrap().clone();
-    ctx
+    ctx_holder.lock().first().unwrap().clone()
 }
 
 fn criterion_benchmark(c: &mut Criterion) {
diff --git a/datafusion/core/benches/spm.rs b/datafusion/core/benches/spm.rs
index ecc3f908d4b15..afd384f7b170e 100644
--- a/datafusion/core/benches/spm.rs
+++ b/datafusion/core/benches/spm.rs
@@ -20,13 +20,13 @@ use std::sync::Arc;
 
 use arrow::array::{ArrayRef, Int32Array, Int64Array, RecordBatch, StringArray};
 use datafusion_execution::TaskContext;
-use datafusion_physical_expr::expressions::col;
 use datafusion_physical_expr::PhysicalSortExpr;
+use datafusion_physical_expr::expressions::col;
 use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
-use datafusion_physical_plan::{collect, ExecutionPlan};
+use datafusion_physical_plan::{ExecutionPlan, collect};
 
 use criterion::async_executor::FuturesExecutor;
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion_datasource::memory::MemorySourceConfig;
 
 fn generate_spm_for_round_robin_tie_breaker(
@@ -66,10 +66,9 @@ fn generate_spm_for_round_robin_tie_breaker(
         RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap()
     };
 
-    let rbs = (0..batch_count).map(|_| rb.clone()).collect::<Vec<_>>();
-    let partitions = vec![rbs.clone(); partition_count];
-    let schema = rb.schema();
+    let schema = rb.schema();
+    let rbs = std::iter::repeat_n(rb, batch_count).collect::<Vec<_>>();
+    let partitions = vec![rbs.clone(); partition_count];
     let sort = [
         PhysicalSortExpr {
             expr: col("b", &schema).unwrap(),
diff --git a/datafusion/core/benches/sql_planner.rs b/datafusion/core/benches/sql_planner.rs
index 6266a7184cf51..fcc8da30fedd9 100644
--- a/datafusion/core/benches/sql_planner.rs
+++ b/datafusion/core/benches/sql_planner.rs
@@ -15,27 +15,26 @@
 // specific language governing permissions and limitations
 // under the License.
-extern crate arrow;
-#[macro_use]
-extern crate criterion;
-extern crate datafusion;
-
 mod data_utils;
 
-use crate::criterion::Criterion;
+use arrow::array::PrimitiveArray;
 use arrow::array::{ArrayRef, RecordBatch};
+use arrow::datatypes::ArrowNativeTypeOp;
+use arrow::datatypes::ArrowPrimitiveType;
 use arrow::datatypes::{DataType, Field, Fields, Schema};
 use criterion::Bencher;
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion::datasource::MemTable;
 use datafusion::execution::context::SessionContext;
-use datafusion_common::{config::Dialect, ScalarValue};
+use datafusion_common::{ScalarValue, config::Dialect};
 use datafusion_expr::col;
+use rand_distr::num_traits::NumCast;
 use std::hint::black_box;
 use std::path::PathBuf;
 use std::sync::Arc;
+use test_utils::TableDef;
 use test_utils::tpcds::tpcds_schemas;
 use test_utils::tpch::tpch_schemas;
-use test_utils::TableDef;
 use tokio::runtime::Runtime;
 
 const BENCHMARKS_PATH_1: &str = "../../benchmarks/";
@@ -74,6 +73,21 @@ fn create_table_provider(column_prefix: &str, num_columns: usize) -> Arc<MemTable> {
 
+fn create_struct_table_provider() -> Arc<MemTable> {
+    let struct_fields = Fields::from(vec![
+        Field::new("value", DataType::Int32, true),
+        Field::new("label", DataType::Utf8, true),
+    ]);
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Int32, true),
+        Field::new("props", DataType::Struct(struct_fields), true),
+    ]));
+    MemTable::try_new(schema, vec![vec![]])
+        .map(Arc::new)
+        .unwrap()
+}
+
 fn create_context() -> SessionContext {
     let ctx = SessionContext::new();
     ctx.register_table("t1", create_table_provider("a", 200))
@@ -84,11 +98,16 @@ fn create_context() -> SessionContext {
         .unwrap();
     ctx.register_table("t1000", create_table_provider("d", 1000))
         .unwrap();
+    ctx.register_table("struct_t1", create_struct_table_provider())
+        .unwrap();
+    ctx.register_table("struct_t2", create_struct_table_provider())
+        .unwrap();
     ctx
 }
 
 /// Register the table definitions as a MemTable with the context and return the
 /// context
+#[expect(clippy::needless_pass_by_value)]
 fn register_defs(ctx: SessionContext, defs: Vec<TableDef>) -> SessionContext {
     defs.iter().for_each(|TableDef { name, schema }| {
         ctx.register_table(
@@ -111,10 +130,27 @@ fn register_clickbench_hits_table(rt: &Runtime) -> SessionContext {
         format!("{BENCHMARKS_PATH_2}{CLICKBENCH_DATA_PATH}")
     };
 
-    let sql = format!("CREATE EXTERNAL TABLE hits STORED AS PARQUET LOCATION '{path}'");
+    let sql =
+        format!("CREATE EXTERNAL TABLE hits_raw STORED AS PARQUET LOCATION '{path}'");
 
+    // The ClickBench partitioned dataset was written by an ancient version of pyarrow
+    // that wrote strings with the wrong logical type. To read it correctly, we must
+    // automatically convert binary to string.
+    rt.block_on(ctx.sql("SET datafusion.execution.parquet.binary_as_string = true;"))
+        .unwrap();
     rt.block_on(ctx.sql(&sql)).unwrap();
 
+    // ClickBench stores EventDate as UInt16 (days since 1970-01-01). Create a view
+    // that exposes it as SQL DATE so that queries comparing it with date literals
+    // (e.g. "EventDate >= '2013-07-01'") work correctly during planning.
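+    // (Illustrative arithmetic: day 15_887 after 1970-01-01 is 2013-07-01, so
+    // with the cast in the view below the comparison really is a DATE
+    // comparison rather than an integer one.)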
+    rt.block_on(ctx.sql(
+        "CREATE VIEW hits AS \
+         SELECT * EXCEPT (\"EventDate\"), \
+         CAST(CAST(\"EventDate\" AS INTEGER) AS DATE) AS \"EventDate\" \
+         FROM hits_raw",
+    ))
+    .unwrap();
+
     let count =
         rt.block_on(async { ctx.table("hits").await.unwrap().count().await.unwrap() });
     assert!(count > 0);
@@ -155,18 +191,30 @@ fn benchmark_with_param_values_many_columns(
 /// 0,100...9900
 /// 0,200...19800
 /// 0,300...29700
-fn register_union_order_table(ctx: &SessionContext, num_columns: usize, num_rows: usize) {
-    // ("c0", [0, 0, ...])
-    // ("c1": [100, 200, ...])
-    // etc
-    let iter = (0..num_columns).map(|i| i as u64).map(|i| {
-        let array: ArrayRef = Arc::new(arrow::array::UInt64Array::from_iter_values(
-            (0..num_rows)
-                .map(|j| j as u64 * 100 + i)
-                .collect::<Vec<_>>(),
-        ));
+fn register_union_order_table_generic<T>(
+    ctx: &SessionContext,
+    num_columns: usize,
+    num_rows: usize,
+) where
+    T: ArrowPrimitiveType,
+    T::Native: ArrowNativeTypeOp + NumCast,
+{
+    let iter = (0..num_columns).map(|i| {
+        let array_data: Vec<T::Native> = (0..num_rows)
+            .map(|j| {
+                let value = (j as u64) * 100 + (i as u64);
+                <T::Native as NumCast>::from(value).unwrap_or_else(|| {
+                    panic!("Failed to cast numeric value to Native type")
+                })
+            })
+            .collect();
+
+        // Use PrimitiveArray which is generic over the ArrowPrimitiveType T
+        let array: ArrayRef = Arc::new(PrimitiveArray::<T>::from_iter_values(array_data));
+
         (format!("c{i}"), array)
     });
+
     let batch = RecordBatch::try_from_iter(iter).unwrap();
     let schema = batch.schema();
     let partitions = vec![vec![batch]];
@@ -183,7 +231,6 @@
     ctx.register_table("t", Arc::new(table)).unwrap();
 }
 
-
 /// return a query like
 /// ```sql
 /// select c1, 2 as c2, ... n as cn from t ORDER BY c1
@@ -226,8 +273,10 @@ fn criterion_benchmark(c: &mut Criterion) {
     if !PathBuf::from(format!("{BENCHMARKS_PATH_1}{CLICKBENCH_DATA_PATH}")).exists()
         && !PathBuf::from(format!("{BENCHMARKS_PATH_2}{CLICKBENCH_DATA_PATH}")).exists()
     {
-        panic!("benchmarks/data/hits_partitioned/ could not be loaded. Please run \
-            'benchmarks/bench.sh data clickbench_partitioned' prior to running this benchmark")
+        panic!(
+            "benchmarks/data/hits_partitioned/ could not be loaded. Please run \
+            'benchmarks/bench.sh data clickbench_partitioned' prior to running this benchmark"
+        )
     }
 
     let ctx = create_context();
@@ -401,15 +450,61 @@ fn criterion_benchmark(c: &mut Criterion) {
         });
     });
 
+    let struct_agg_sort_query = "SELECT \
+        struct_t1.props['label'], \
+        SUM(struct_t1.props['value']), \
+        MAX(struct_t2.props['value']), \
+        COUNT(*) \
+        FROM struct_t1 \
+        JOIN struct_t2 ON struct_t1.id = struct_t2.id \
+        WHERE struct_t1.props['value'] > 50 \
+        GROUP BY struct_t1.props['label'] \
+        ORDER BY SUM(struct_t1.props['value']) DESC";
+
+    // -- Struct column benchmarks --
+    c.bench_function("logical_plan_struct_join_agg_sort", |b| {
+        b.iter(|| logical_plan(&ctx, &rt, struct_agg_sort_query))
+    });
+    c.bench_function("physical_plan_struct_join_agg_sort", |b| {
+        b.iter(|| physical_plan(&ctx, &rt, struct_agg_sort_query))
+    });
+
     // -- Sorted Queries --
     // 100, 200 && 300 is taking too long - https://github.com/apache/datafusion/issues/18366
+    // The logical plans for Int64 and UInt64 differ: the UInt64 plan's Unions are
+    // wrapped in Projections, so the EliminateNestedUnion optimizer rule is not
+    // applied, leading to significantly longer execution time.
+    // https://github.com/apache/datafusion/issues/17261
+
     for column_count in [10, 50 /* 100, 200, 300 */] {
-        register_union_order_table(&ctx, column_count, 1000);
+        register_union_order_table_generic::<arrow::datatypes::Int64Type>(
+            &ctx,
+            column_count,
+            1000,
+        );
 
         // this query has many expressions in its sort order so stresses
         // order equivalence validation
         c.bench_function(
-            &format!("physical_sorted_union_order_by_{column_count}"),
+            &format!("physical_sorted_union_order_by_{column_count}_int64"),
+            |b| {
+                // SELECT ... UNION ALL ...
+                let query = union_orderby_query(column_count);
+                b.iter(|| physical_plan(&ctx, &rt, &query))
+            },
+        );
+
+        let _ = ctx.deregister_table("t");
+    }
+
+    for column_count in [10, 50 /* 100, 200, 300 */] {
+        register_union_order_table_generic::<arrow::datatypes::UInt64Type>(
+            &ctx,
+            column_count,
+            1000,
+        );
+        c.bench_function(
+            &format!("physical_sorted_union_order_by_{column_count}_uint64"),
             |b| {
                 // SELECT ... UNION ALL ...
                 let query = union_orderby_query(column_count);
                 b.iter(|| physical_plan(&ctx, &rt, &query))
@@ -477,9 +572,6 @@ fn criterion_benchmark(c: &mut Criterion) {
     };
 
     let raw_tpcds_sql_queries = (1..100)
-        // skip query 75 until it is fixed
-        // https://github.com/apache/datafusion/issues/17801
-        .filter(|q| *q != 75)
         .map(|q| std::fs::read_to_string(format!("{tests_path}tpc-ds/{q}.sql")).unwrap())
        .collect::<Vec<_>>();
diff --git a/datafusion/core/benches/sql_planner_extended.rs b/datafusion/core/benches/sql_planner_extended.rs
index aff7cb4d101d5..d4955313c79c3 100644
--- a/datafusion/core/benches/sql_planner_extended.rs
+++ b/datafusion/core/benches/sql_planner_extended.rs
@@ -18,7 +18,7 @@
 use arrow::array::{ArrayRef, RecordBatch};
 use arrow_schema::DataType;
 use arrow_schema::TimeUnit::Nanosecond;
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
 use datafusion::prelude::{DataFrame, SessionContext};
 use datafusion_catalog::MemTable;
 use datafusion_common::ScalarValue;
@@ -27,6 +27,7 @@ use datafusion_expr::{cast, col, lit, not, try_cast, when};
 use datafusion_functions::expr_fn::{
     btrim, length, regexp_like, regexp_replace, to_timestamp, upper,
 };
+use std::fmt::Write;
 use std::hint::black_box;
 use std::ops::Rem;
 use std::sync::Arc;
@@ -212,14 +213,127 @@ fn build_test_data_frame(ctx: &SessionContext, rt: &Runtime) -> DataFrame {
     })
 }
 
-fn criterion_benchmark(c: &mut Criterion) {
+/// Build a CASE-heavy dataframe over a non-inner join to stress
+/// planner-time filter pushdown and nullability/type inference.
+fn build_case_heavy_left_join_df(ctx: &SessionContext, rt: &Runtime) -> DataFrame {
+    register_string_table(ctx, 100, 1000);
+    let query = build_case_heavy_left_join_query(30, 1);
+    rt.block_on(async { ctx.sql(&query).await.unwrap() })
+}
+
+fn build_case_heavy_left_join_query(predicate_count: usize, case_depth: usize) -> String {
+    let mut query = String::from(
+        "SELECT l.c0, r.c0 AS rc0 FROM t l LEFT JOIN t r ON l.c0 = r.c0 WHERE ",
+    );
+
+    if predicate_count == 0 {
+        query.push_str("TRUE");
+        return query;
+    }
+
+    // Keep this deterministic so comparisons between profiles are stable.
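+    // For illustration (not emitted verbatim): predicate_count=2 and
+    // case_depth=1 produce a WHERE clause of the shape
+    //   CASE WHEN l.c1 IS NOT NULL THEN length(l.c0) ELSE length(r.c2) END > 2
+    //   AND CASE WHEN l.c2 IS NOT NULL THEN length(l.c1) ELSE length(r.c3) END > 2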
+ for i in 0..predicate_count { + if i > 0 { + query.push_str(" AND "); + } + + let mut expr = format!("length(l.c{})", i % 20); + for depth in 0..case_depth { + let left_col = (i + depth + 1) % 20; + let right_col = (i + depth + 2) % 20; + expr = format!( + "CASE WHEN l.c{left_col} IS NOT NULL THEN {expr} ELSE length(r.c{right_col}) END" + ); + } + + let _ = write!(&mut query, "{expr} > 2"); + } + + query +} + +fn build_case_heavy_left_join_df_with_push_down_filter( + rt: &Runtime, + predicate_count: usize, + case_depth: usize, + push_down_filter_enabled: bool, +) -> DataFrame { + let ctx = SessionContext::new(); + register_string_table(&ctx, 100, 1000); + if !push_down_filter_enabled { + let removed = ctx.remove_optimizer_rule("push_down_filter"); + assert!( + removed, + "push_down_filter rule should be present in the default optimizer" + ); + } + + let query = build_case_heavy_left_join_query(predicate_count, case_depth); + rt.block_on(async { ctx.sql(&query).await.unwrap() }) +} + +fn build_non_case_left_join_query( + predicate_count: usize, + nesting_depth: usize, +) -> String { + let mut query = String::from( + "SELECT l.c0, r.c0 AS rc0 FROM t l LEFT JOIN t r ON l.c0 = r.c0 WHERE ", + ); + + if predicate_count == 0 { + query.push_str("TRUE"); + return query; + } + + // Keep this deterministic so comparisons between profiles are stable. + for i in 0..predicate_count { + if i > 0 { + query.push_str(" AND "); + } + + let left_col = i % 20; + let mut expr = format!("l.c{left_col}"); + for depth in 0..nesting_depth { + let right_col = (i + depth + 1) % 20; + expr = format!("coalesce({expr}, r.c{right_col})"); + } + + let _ = write!(&mut query, "length({expr}) > 2"); + } + + query +} + +fn build_non_case_left_join_df_with_push_down_filter( + rt: &Runtime, + predicate_count: usize, + nesting_depth: usize, + push_down_filter_enabled: bool, +) -> DataFrame { let ctx = SessionContext::new(); + register_string_table(&ctx, 100, 1000); + if !push_down_filter_enabled { + let removed = ctx.remove_optimizer_rule("push_down_filter"); + assert!( + removed, + "push_down_filter rule should be present in the default optimizer" + ); + } + + let query = build_non_case_left_join_query(predicate_count, nesting_depth); + rt.block_on(async { ctx.sql(&query).await.unwrap() }) +} + +fn criterion_benchmark(c: &mut Criterion) { + let baseline_ctx = SessionContext::new(); + let case_heavy_ctx = SessionContext::new(); let rt = Runtime::new().unwrap(); // validate logical plan optimize performance // https://github.com/apache/datafusion/issues/17261 - let df = build_test_data_frame(&ctx, &rt); + let df = build_test_data_frame(&baseline_ctx, &rt); + let case_heavy_left_join_df = build_case_heavy_left_join_df(&case_heavy_ctx, &rt); c.bench_function("logical_plan_optimize", |b| { b.iter(|| { @@ -227,6 +341,125 @@ fn criterion_benchmark(c: &mut Criterion) { black_box(rt.block_on(async { df_clone.into_optimized_plan().unwrap() })); }) }); + + c.bench_function("logical_plan_optimize_hotspot_case_heavy_left_join", |b| { + b.iter(|| { + let df_clone = case_heavy_left_join_df.clone(); + black_box(rt.block_on(async { df_clone.into_optimized_plan().unwrap() })); + }) + }); + + let predicate_sweep = [10, 20, 30, 40, 60]; + let case_depth_sweep = [1, 2, 3]; + + let mut hotspot_group = + c.benchmark_group("push_down_filter_hotspot_case_heavy_left_join_ab"); + for case_depth in case_depth_sweep { + for predicate_count in predicate_sweep { + let with_push_down_filter = + build_case_heavy_left_join_df_with_push_down_filter( + &rt, + 
predicate_count, + case_depth, + true, + ); + let without_push_down_filter = + build_case_heavy_left_join_df_with_push_down_filter( + &rt, + predicate_count, + case_depth, + false, + ); + + let input_label = + format!("predicates={predicate_count},case_depth={case_depth}"); + // A/B interpretation: + // - with_push_down_filter: default optimizer path (rule enabled) + // - without_push_down_filter: control path with the rule removed + // Compare both IDs at the same sweep point to isolate rule impact. + hotspot_group.bench_with_input( + BenchmarkId::new("with_push_down_filter", &input_label), + &with_push_down_filter, + |b, df| { + b.iter(|| { + let df_clone = df.clone(); + black_box( + rt.block_on(async { + df_clone.into_optimized_plan().unwrap() + }), + ); + }) + }, + ); + hotspot_group.bench_with_input( + BenchmarkId::new("without_push_down_filter", &input_label), + &without_push_down_filter, + |b, df| { + b.iter(|| { + let df_clone = df.clone(); + black_box( + rt.block_on(async { + df_clone.into_optimized_plan().unwrap() + }), + ); + }) + }, + ); + } + } + hotspot_group.finish(); + + let mut control_group = + c.benchmark_group("push_down_filter_control_non_case_left_join_ab"); + for nesting_depth in case_depth_sweep { + for predicate_count in predicate_sweep { + let with_push_down_filter = build_non_case_left_join_df_with_push_down_filter( + &rt, + predicate_count, + nesting_depth, + true, + ); + let without_push_down_filter = + build_non_case_left_join_df_with_push_down_filter( + &rt, + predicate_count, + nesting_depth, + false, + ); + + let input_label = + format!("predicates={predicate_count},nesting_depth={nesting_depth}"); + control_group.bench_with_input( + BenchmarkId::new("with_push_down_filter", &input_label), + &with_push_down_filter, + |b, df| { + b.iter(|| { + let df_clone = df.clone(); + black_box( + rt.block_on(async { + df_clone.into_optimized_plan().unwrap() + }), + ); + }) + }, + ); + control_group.bench_with_input( + BenchmarkId::new("without_push_down_filter", &input_label), + &without_push_down_filter, + |b, df| { + b.iter(|| { + let df_clone = df.clone(); + black_box( + rt.block_on(async { + df_clone.into_optimized_plan().unwrap() + }), + ); + }) + }, + ); + } + } + control_group.finish(); } criterion_group!(benches, criterion_benchmark); diff --git a/datafusion/core/benches/sql_query_with_io.rs b/datafusion/core/benches/sql_query_with_io.rs index 58797dfed6b67..fc8caf31acd11 100644 --- a/datafusion/core/benches/sql_query_with_io.rs +++ b/datafusion/core/benches/sql_query_with_io.rs @@ -20,7 +20,7 @@ use std::{fmt::Write, sync::Arc, time::Duration}; use arrow::array::{Int64Builder, RecordBatch, UInt64Builder}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use bytes::Bytes; -use criterion::{criterion_group, criterion_main, Criterion, SamplingMode}; +use criterion::{Criterion, SamplingMode, criterion_group, criterion_main}; use datafusion::{ datasource::{ file_format::parquet::ParquetFormat, @@ -31,13 +31,13 @@ use datafusion::{ use datafusion_execution::runtime_env::RuntimeEnv; use itertools::Itertools; use object_store::{ + ObjectStore, ObjectStoreExt, memory::InMemory, path::Path, throttle::{ThrottleConfig, ThrottledStore}, - ObjectStore, }; use parquet::arrow::ArrowWriter; -use rand::{rngs::StdRng, Rng, SeedableRng}; +use rand::{Rng, SeedableRng, rngs::StdRng}; use tokio::runtime::Runtime; use url::Url; diff --git a/datafusion/core/benches/struct_query_sql.rs b/datafusion/core/benches/struct_query_sql.rs index 5c7b427310827..96434fc379ea6 100644 --- 
a/datafusion/core/benches/struct_query_sql.rs
+++ b/datafusion/core/benches/struct_query_sql.rs
@@ -20,7 +20,7 @@ use arrow::{
     datatypes::{DataType, Field, Schema},
     record_batch::RecordBatch,
 };
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion::prelude::SessionContext;
 use datafusion::{datasource::MemTable, error::Result};
 use futures::executor::block_on;
diff --git a/datafusion/core/benches/topk_aggregate.rs b/datafusion/core/benches/topk_aggregate.rs
index 9a5fb7163be5c..c78b1ea494407 100644
--- a/datafusion/core/benches/topk_aggregate.rs
+++ b/datafusion/core/benches/topk_aggregate.rs
@@ -17,26 +17,70 @@
 
 mod data_utils;
 
+use arrow::array::Int64Builder;
+use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use arrow::record_batch::RecordBatch;
 use arrow::util::pretty::pretty_format_batches;
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use data_utils::make_data;
-use datafusion::physical_plan::{collect, displayable, ExecutionPlan};
+use datafusion::physical_plan::{collect, displayable};
 use datafusion::prelude::SessionContext;
 use datafusion::{datasource::MemTable, error::Result};
 use datafusion_execution::config::SessionConfig;
-use datafusion_execution::TaskContext;
+use rand::SeedableRng;
+use rand::seq::SliceRandom;
 use std::hint::black_box;
 use std::sync::Arc;
 use tokio::runtime::Runtime;
 
+const LIMIT: usize = 10;
+
+/// Create deterministic data for DISTINCT benchmarks with predictable trace_ids
+/// This ensures consistent results across benchmark runs
+fn make_distinct_data(
+    partition_cnt: i32,
+    sample_cnt: i32,
+) -> Result<(Arc<Schema>, Vec<Vec<RecordBatch>>)> {
+    let mut rng = rand::rngs::SmallRng::from_seed([42; 32]);
+    let total_samples = partition_cnt as usize * sample_cnt as usize;
+    let mut ids = Vec::new();
+    for i in 0..total_samples {
+        ids.push(i as i64);
+    }
+    ids.shuffle(&mut rng);
+
+    let mut global_idx = 0;
+    let schema = test_distinct_schema();
+    let mut partitions = vec![];
+    for _ in 0..partition_cnt {
+        let mut id_builder = Int64Builder::new();
+
+        for _ in 0..sample_cnt {
+            let id = ids[global_idx];
+            id_builder.append_value(id);
+            global_idx += 1;
+        }
+
+        let id_col = Arc::new(id_builder.finish());
+        let batch = RecordBatch::try_new(schema.clone(), vec![id_col])?;
+        partitions.push(vec![batch]);
+    }
+
+    Ok((schema, partitions))
+}
+
+/// Returns a Schema for distinct benchmarks with i64 trace_id
+fn test_distinct_schema() -> SchemaRef {
+    Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)]))
+}
+
 async fn create_context(
-    limit: usize,
     partition_cnt: i32,
     sample_cnt: i32,
     asc: bool,
     use_topk: bool,
     use_view: bool,
-) -> Result<(Arc<dyn ExecutionPlan>, Arc<TaskContext>)> {
+) -> Result<SessionContext> {
     let (schema, parts) = make_data(partition_cnt, sample_cnt, asc, use_view).unwrap();
     let mem_table = Arc::new(MemTable::try_new(schema, parts).unwrap());
 
@@ -46,165 +90,408 @@ async fn create_context(
     opts.optimizer.enable_topk_aggregation = use_topk;
     let ctx = SessionContext::new_with_config(cfg);
     let _ = ctx.register_table("traces", mem_table)?;
-    let sql = format!("select trace_id, max(timestamp_ms) from traces group by trace_id order by max(timestamp_ms) desc limit {limit};");
+
+    Ok(ctx)
+}
+
+async fn create_context_distinct(
+    partition_cnt: i32,
+    sample_cnt: i32,
+    use_topk: bool,
+) -> Result<SessionContext> {
+    // Use deterministic data generation for DISTINCT queries to ensure consistent results
+    let (schema, parts) = 
make_distinct_data(partition_cnt, sample_cnt).unwrap();
+    let mem_table = Arc::new(MemTable::try_new(schema, parts).unwrap());
+
+    // Create the DataFrame
+    let mut cfg = SessionConfig::new();
+    let opts = cfg.options_mut();
+    opts.optimizer.enable_topk_aggregation = use_topk;
+    let ctx = SessionContext::new_with_config(cfg);
+    let _ = ctx.register_table("traces", mem_table)?;
+
+    Ok(ctx)
+}
+
+fn run(rt: &Runtime, ctx: SessionContext, limit: usize, use_topk: bool, asc: bool) {
+    black_box(rt.block_on(async { aggregate(ctx, limit, use_topk, asc).await })).unwrap();
+}
+
+fn run_string(rt: &Runtime, ctx: SessionContext, limit: usize, use_topk: bool) {
+    black_box(rt.block_on(async { aggregate_string(ctx, limit, use_topk).await }))
+        .unwrap();
+}
+
+fn run_distinct(
+    rt: &Runtime,
+    ctx: SessionContext,
+    limit: usize,
+    use_topk: bool,
+    asc: bool,
+) {
+    black_box(rt.block_on(async { aggregate_distinct(ctx, limit, use_topk, asc).await }))
+        .unwrap();
+}
+
+async fn aggregate(
+    ctx: SessionContext,
+    limit: usize,
+    use_topk: bool,
+    asc: bool,
+) -> Result<()> {
+    let sql = format!(
+        "select max(timestamp_ms) from traces group by trace_id order by max(timestamp_ms) desc limit {limit};"
+    );
     let df = ctx.sql(sql.as_str()).await?;
-    let physical_plan = df.create_physical_plan().await?;
-    let actual_phys_plan = displayable(physical_plan.as_ref()).indent(true).to_string();
+    let plan = df.create_physical_plan().await?;
+    let actual_phys_plan = displayable(plan.as_ref()).indent(true).to_string();
     assert_eq!(
         actual_phys_plan.contains(&format!("lim=[{limit}]")),
         use_topk
     );
 
-    Ok((physical_plan, ctx.task_ctx()))
+    let batches = collect(plan, ctx.task_ctx()).await?;
+    assert_eq!(batches.len(), 1);
+    let batch = batches.first().unwrap();
+    assert_eq!(batch.num_rows(), LIMIT);
+
+    let actual = format!("{}", pretty_format_batches(&batches)?).to_lowercase();
+    let expected_asc = r#"
++--------------------------+
+| max(traces.timestamp_ms) |
++--------------------------+
+| 16909009999999           |
+| 16909009999998           |
+| 16909009999997           |
+| 16909009999996           |
+| 16909009999995           |
+| 16909009999994           |
+| 16909009999993           |
+| 16909009999992           |
+| 16909009999991           |
+| 16909009999990           |
++--------------------------+
+    "#
+    .trim();
+    if asc {
+        assert_eq!(actual.trim(), expected_asc);
+    }
+
+    Ok(())
 }
 
-fn run(rt: &Runtime, plan: Arc<dyn ExecutionPlan>, ctx: Arc<TaskContext>, asc: bool) {
-    black_box(rt.block_on(async { aggregate(plan.clone(), ctx.clone(), asc).await }))
-        .unwrap();
+/// Benchmark for string aggregate functions with topk optimization.
+/// This tests grouping by a numeric column (timestamp_ms) and aggregating
+/// a string column (trace_id) with Utf8 or Utf8View data types.
+async fn aggregate_string(
+    ctx: SessionContext,
+    limit: usize,
+    use_topk: bool,
+) -> Result<Vec<RecordBatch>> {
+    let sql = format!(
+        "select max(trace_id) from traces group by timestamp_ms order by max(trace_id) desc limit {limit};"
+    );
+    let df = ctx.sql(sql.as_str()).await?;
+    let plan = df.create_physical_plan().await?;
+    let actual_phys_plan = displayable(plan.as_ref()).indent(true).to_string();
+    assert_eq!(
+        actual_phys_plan.contains(&format!("lim=[{limit}]")),
+        use_topk
+    );
+
+    let batches = collect(plan, ctx.task_ctx()).await?;
+    assert_eq!(batches.len(), 1);
+    let batch = batches.first().unwrap();
+    assert_eq!(batch.num_rows(), LIMIT);
+
+    Ok(batches)
 }
 
-async fn aggregate(
-    plan: Arc<dyn ExecutionPlan>,
-    ctx: Arc<TaskContext>,
+async fn aggregate_distinct(
+    ctx: SessionContext,
+    limit: usize,
+    use_topk: bool,
     asc: bool,
 ) -> Result<()> {
-    let batches = collect(plan, ctx).await?;
+    let order_direction = if asc { "asc" } else { "desc" };
+    let sql = format!(
+        "select id from traces group by id order by id {order_direction} limit {limit};"
+    );
+    let df = ctx.sql(sql.as_str()).await?;
+    let plan = df.create_physical_plan().await?;
+    let actual_phys_plan = displayable(plan.as_ref()).indent(true).to_string();
+    assert_eq!(
+        actual_phys_plan.contains(&format!("lim=[{limit}]")),
+        use_topk
+    );
+    let batches = collect(plan, ctx.task_ctx()).await?;
     assert_eq!(batches.len(), 1);
     let batch = batches.first().unwrap();
-    assert_eq!(batch.num_rows(), 10);
+    assert_eq!(batch.num_rows(), LIMIT);
 
     let actual = format!("{}", pretty_format_batches(&batches)?).to_lowercase();
+
     let expected_asc = r#"
-+----------------------------------+--------------------------+
-| trace_id                         | max(traces.timestamp_ms) |
-+----------------------------------+--------------------------+
-| 5868861a23ed31355efc5200eb80fe74 | 16909009999999           |
-| 4040e64656804c3d77320d7a0e7eb1f0 | 16909009999998           |
-| 02801bbe533190a9f8713d75222f445d | 16909009999997           |
-| 9e31b3b5a620de32b68fefa5aeea57f1 | 16909009999996           |
-| 2d88a860e9bd1cfaa632d8e7caeaa934 | 16909009999995           |
-| a47edcef8364ab6f191dd9103e51c171 | 16909009999994           |
-| 36a3fa2ccfbf8e00337f0b1254384db6 | 16909009999993           |
-| 0756be84f57369012e10de18b57d8a2f | 16909009999992           |
-| d4d6bf9845fa5897710e3a8db81d5907 | 16909009999991           |
-| 3c2cc1abe728a66b61e14880b53482a0 | 16909009999990           |
-+----------------------------------+--------------------------+
-    "#
++----+
+| id |
++----+
+| 0  |
+| 1  |
+| 2  |
+| 3  |
+| 4  |
+| 5  |
+| 6  |
+| 7  |
+| 8  |
+| 9  |
++----+
+"#
     .trim();
+
+    let expected_desc = r#"
++---------+
+| id      |
++---------+
+| 9999999 |
+| 9999998 |
+| 9999997 |
+| 9999996 |
+| 9999995 |
+| 9999994 |
+| 9999993 |
+| 9999992 |
+| 9999991 |
+| 9999990 |
++---------+
+"#
+    .trim();
+
+    // Verify exact results match expected values
     if asc {
-        assert_eq!(actual.trim(), expected_asc);
+        assert_eq!(
+            actual.trim(),
+            expected_asc,
+            "Ascending DISTINCT results do not match expected values"
+        );
+    } else {
+        assert_eq!(
+            actual.trim(),
+            expected_desc,
+            "Descending DISTINCT results do not match expected values"
+        );
     }
 
     Ok(())
 }
 
+struct BenchCase<'a> {
+    name_tpl: &'a str,
+    asc: bool,
+    use_topk: bool,
+    use_view: bool,
+}
+
+struct StringCase {
+    asc: bool,
+    use_topk: bool,
+    use_view: bool,
+}
+
+fn assert_utf8_utf8view_match(
+    rt: &Runtime,
+    partitions: i32,
+    samples: i32,
+    limit: usize,
+    asc: bool,
+    use_topk: bool,
+) {
+    let ctx_utf8 = rt
+        .block_on(create_context(partitions, samples, asc, use_topk, false))
+        .unwrap();
+    let ctx_view = rt
+        .block_on(create_context(partitions, samples, asc, 
use_topk, true)) + .unwrap(); + let batches_utf8 = rt + .block_on(aggregate_string(ctx_utf8, limit, use_topk)) + .unwrap(); + let batches_view = rt + .block_on(aggregate_string(ctx_view, limit, use_topk)) + .unwrap(); + let result_utf8 = pretty_format_batches(&batches_utf8).unwrap().to_string(); + let result_view = pretty_format_batches(&batches_view).unwrap().to_string(); + assert_eq!( + result_utf8, result_view, + "Utf8 vs Utf8View mismatch for asc={asc}, use_topk={use_topk}" + ); +} + +fn assert_string_results_match( + rt: &Runtime, + partitions: i32, + samples: i32, + limit: usize, +) { + for asc in [false, true] { + for use_topk in [false, true] { + assert_utf8_utf8view_match(rt, partitions, samples, limit, asc, use_topk); + } + } +} + fn criterion_benchmark(c: &mut Criterion) { let rt = Runtime::new().unwrap(); - let limit = 10; + let limit = LIMIT; let partitions = 10; let samples = 1_000_000; + let total_rows = partitions * samples; - c.bench_function( - format!("aggregate {} time-series rows", partitions * samples).as_str(), - |b| { - b.iter(|| { - let real = rt.block_on(async { - create_context(limit, partitions, samples, false, false, false) - .await - .unwrap() - }); - run(&rt, real.0.clone(), real.1.clone(), false) - }) + // Numeric aggregate benchmarks + let numeric_cases = &[ + BenchCase { + name_tpl: "aggregate {rows} time-series rows", + asc: false, + use_topk: false, + use_view: false, }, - ); - - c.bench_function( - format!("aggregate {} worst-case rows", partitions * samples).as_str(), - |b| { - b.iter(|| { - let asc = rt.block_on(async { - create_context(limit, partitions, samples, true, false, false) - .await - .unwrap() - }); - run(&rt, asc.0.clone(), asc.1.clone(), true) - }) + BenchCase { + name_tpl: "aggregate {rows} worst-case rows", + asc: true, + use_topk: false, + use_view: false, }, - ); - - c.bench_function( - format!( - "top k={limit} aggregate {} time-series rows", - partitions * samples - ) - .as_str(), - |b| { - b.iter(|| { - let topk_real = rt.block_on(async { - create_context(limit, partitions, samples, false, true, false) - .await - .unwrap() - }); - run(&rt, topk_real.0.clone(), topk_real.1.clone(), false) - }) + BenchCase { + name_tpl: "top k={limit} aggregate {rows} time-series rows", + asc: false, + use_topk: true, + use_view: false, }, - ); - - c.bench_function( - format!( - "top k={limit} aggregate {} worst-case rows", - partitions * samples - ) - .as_str(), - |b| { - b.iter(|| { - let topk_asc = rt.block_on(async { - create_context(limit, partitions, samples, true, true, false) - .await - .unwrap() - }); - run(&rt, topk_asc.0.clone(), topk_asc.1.clone(), true) - }) + BenchCase { + name_tpl: "top k={limit} aggregate {rows} worst-case rows", + asc: true, + use_topk: true, + use_view: false, }, - ); - - // Utf8View schema,time-series rows - c.bench_function( - format!( - "top k={limit} aggregate {} time-series rows [Utf8View]", - partitions * samples - ) - .as_str(), - |b| { - b.iter(|| { - let topk_real = rt.block_on(async { - create_context(limit, partitions, samples, false, true, true) - .await - .unwrap() - }); - run(&rt, topk_real.0.clone(), topk_real.1.clone(), false) - }) + BenchCase { + name_tpl: "top k={limit} aggregate {rows} time-series rows [Utf8View]", + asc: false, + use_topk: true, + use_view: true, }, - ); + BenchCase { + name_tpl: "top k={limit} aggregate {rows} worst-case rows [Utf8View]", + asc: true, + use_topk: true, + use_view: true, + }, + ]; + for case in numeric_cases { + let name = case + .name_tpl + .replace("{rows}", 
&total_rows.to_string()) + .replace("{limit}", &limit.to_string()); + let ctx = rt + .block_on(create_context( + partitions, + samples, + case.asc, + case.use_topk, + case.use_view, + )) + .unwrap(); + c.bench_function(&name, |b| { + b.iter(|| run(&rt, ctx.clone(), limit, case.use_topk, case.asc)) + }); + } - // Utf8View schema,worst-case rows - c.bench_function( - format!( - "top k={limit} aggregate {} worst-case rows [Utf8View]", - partitions * samples - ) - .as_str(), - |b| { - b.iter(|| { - let topk_asc = rt.block_on(async { - create_context(limit, partitions, samples, true, true, true) - .await - .unwrap() - }); - run(&rt, topk_asc.0.clone(), topk_asc.1.clone(), true) - }) + assert_string_results_match(&rt, partitions, samples, limit); + + let string_cases = &[ + StringCase { + asc: false, + use_topk: false, + use_view: false, }, - ); + StringCase { + asc: true, + use_topk: false, + use_view: false, + }, + StringCase { + asc: false, + use_topk: false, + use_view: true, + }, + StringCase { + asc: true, + use_topk: false, + use_view: true, + }, + StringCase { + asc: false, + use_topk: true, + use_view: false, + }, + StringCase { + asc: true, + use_topk: true, + use_view: false, + }, + StringCase { + asc: false, + use_topk: true, + use_view: true, + }, + StringCase { + asc: true, + use_topk: true, + use_view: true, + }, + ]; + for case in string_cases { + let scenario = if case.asc { + "worst-case" + } else { + "time-series" + }; + let type_label = if case.use_view { "Utf8View" } else { "Utf8" }; + let name = if case.use_topk { + format!( + "top k={limit} string aggregate {total_rows} {scenario} rows [{type_label}]" + ) + } else { + format!("string aggregate {total_rows} {scenario} rows [{type_label}]") + }; + let ctx = rt + .block_on(create_context( + partitions, + samples, + case.asc, + case.use_topk, + case.use_view, + )) + .unwrap(); + c.bench_function(&name, |b| { + b.iter(|| run_string(&rt, ctx.clone(), limit, case.use_topk)) + }); + } + + // DISTINCT benchmarks + for use_topk in [false, true] { + let ctx = rt.block_on(async { + create_context_distinct(partitions, samples, use_topk) + .await + .unwrap() + }); + let topk_label = if use_topk { "TopK" } else { "no TopK" }; + for asc in [false, true] { + let dir = if asc { "asc" } else { "desc" }; + let name = format!("distinct {total_rows} rows {dir} [{topk_label}]"); + c.bench_function(&name, |b| { + b.iter(|| run_distinct(&rt, ctx.clone(), limit, use_topk, asc)) + }); + } + } } criterion_group!(benches, criterion_benchmark); diff --git a/datafusion/core/benches/topk_repartition.rs b/datafusion/core/benches/topk_repartition.rs new file mode 100644 index 0000000000000..e1f14e4aaa633 --- /dev/null +++ b/datafusion/core/benches/topk_repartition.rs @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Benchmark for the TopKRepartition optimizer rule.
+//!
+//! Measures the benefit of pushing TopK (Sort with fetch) below hash
+//! repartition when running partitioned window functions with LIMIT.
+
+mod data_utils;
+
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+use data_utils::create_table_provider;
+use datafusion::prelude::{SessionConfig, SessionContext};
+use parking_lot::Mutex;
+use std::hint::black_box;
+use std::sync::Arc;
+use tokio::runtime::Runtime;
+
+#[expect(clippy::needless_pass_by_value)]
+fn query(ctx: Arc<Mutex<SessionContext>>, rt: &Runtime, sql: &str) {
+    let df = rt.block_on(ctx.lock().sql(sql)).unwrap();
+    black_box(rt.block_on(df.collect()).unwrap());
+}
+
+fn create_context(
+    partitions_len: usize,
+    target_partitions: usize,
+    enable_topk_repartition: bool,
+) -> Arc<Mutex<SessionContext>> {
+    let array_len = 1024 * 1024;
+    let batch_size = 8 * 1024;
+    let mut config = SessionConfig::new().with_target_partitions(target_partitions);
+    config.options_mut().optimizer.enable_topk_repartition = enable_topk_repartition;
+    let ctx = SessionContext::new_with_config(config);
+    let rt = Runtime::new().unwrap();
+    rt.block_on(async {
+        let provider =
+            create_table_provider(partitions_len, array_len, batch_size).unwrap();
+        ctx.register_table("t", provider).unwrap();
+    });
+    Arc::new(Mutex::new(ctx))
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let rt = Runtime::new().unwrap();
+
+    let limits = [10, 1_000, 10_000, 100_000];
+    let scans = 16;
+    let target_partitions = 4;
+
+    let group = format!("topk_repartition_{scans}_to_{target_partitions}");
+    let mut group = c.benchmark_group(group);
+    for limit in limits {
+        let sql = format!(
+            "SELECT \
+             SUM(f64) OVER (PARTITION BY u64_narrow ORDER BY u64_wide ROWS UNBOUNDED PRECEDING) \
+             FROM t \
+             ORDER BY u64_narrow, u64_wide \
+             LIMIT {limit}"
+        );
+
+        let ctx_disabled = create_context(scans, target_partitions, false);
+        group.bench_function(BenchmarkId::new("disabled", limit), |b| {
+            b.iter(|| query(ctx_disabled.clone(), &rt, &sql))
+        });
+
+        let ctx_enabled = create_context(scans, target_partitions, true);
+        group.bench_function(BenchmarkId::new("enabled", limit), |b| {
+            b.iter(|| query(ctx_enabled.clone(), &rt, &sql))
+        });
+    }
+    group.finish();
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/core/benches/window_query_sql.rs b/datafusion/core/benches/window_query_sql.rs
index 6d83959f7eb3c..1657cae913fef 100644
--- a/datafusion/core/benches/window_query_sql.rs
+++ b/datafusion/core/benches/window_query_sql.rs
@@ -15,14 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
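The knob this new benchmark measures is an ordinary optimizer option; a minimal sketch of flipping it, using the same configuration path as the benchmark's `create_context`:

```rust
// Sketch only: toggles the rule the topk_repartition benchmark compares.
use datafusion::prelude::{SessionConfig, SessionContext};

fn context_with_topk_repartition(enabled: bool) -> SessionContext {
    let mut config = SessionConfig::new().with_target_partitions(4);
    // When enabled, the TopK (Sort with fetch) can be pushed below the hash
    // repartition, shrinking the data each partition must sort.
    config.options_mut().optimizer.enable_topk_repartition = enabled;
    SessionContext::new_with_config(config)
}
```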
-#[macro_use]
-extern crate criterion;
-extern crate arrow;
-extern crate datafusion;
-
 mod data_utils;
 
-use crate::criterion::Criterion;
+use criterion::{Criterion, criterion_group, criterion_main};
 use data_utils::create_table_provider;
 use datafusion::error::Result;
 use datafusion::execution::context::SessionContext;
@@ -31,6 +26,7 @@ use std::hint::black_box;
 use std::sync::Arc;
 use tokio::runtime::Runtime;
 
+#[expect(clippy::needless_pass_by_value)]
 fn query(ctx: Arc<Mutex<SessionContext>>, rt: &Runtime, sql: &str) {
     let df = rt.block_on(ctx.lock().sql(sql)).unwrap();
     black_box(rt.block_on(df.collect()).unwrap());
diff --git a/datafusion/core/src/bin/print_functions_docs.rs b/datafusion/core/src/bin/print_functions_docs.rs
index 63387c023b11a..c34865a32d532 100644
--- a/datafusion/core/src/bin/print_functions_docs.rs
+++ b/datafusion/core/src/bin/print_functions_docs.rs
@@ -16,14 +16,15 @@
 // under the License.
 
 use datafusion::execution::SessionStateDefaults;
-use datafusion_common::{not_impl_err, HashSet, Result};
+use datafusion_common::{HashSet, Result, not_impl_err};
 use datafusion_expr::{
-    aggregate_doc_sections, scalar_doc_sections, window_doc_sections, AggregateUDF,
-    DocSection, Documentation, ScalarUDF, WindowUDF,
+    AggregateUDF, DocSection, Documentation, HigherOrderUDF, ScalarUDF, WindowUDF,
+    aggregate_doc_sections, scalar_doc_sections, window_doc_sections,
 };
 use itertools::Itertools;
 use std::env::args;
 use std::fmt::Write as _;
+use std::sync::Arc;
 
 /// Print documentation for all functions of a given type to stdout
 ///
@@ -71,6 +72,10 @@ fn print_scalar_docs() -> Result<String> {
         providers.push(Box::new(f.as_ref().clone()));
     }
 
+    for f in SessionStateDefaults::default_higher_order_functions() {
+        providers.push(Box::new(f));
+    }
+
     print_docs(providers, scalar_doc_sections::doc_sections())
 }
 
@@ -84,30 +89,7 @@
     print_docs(providers, window_doc_sections::doc_sections())
 }
 
-// Temporary method useful to semi automate
-// the migration of UDF documentation generation from code based
-// to attribute based
-// To be removed
-#[allow(dead_code)]
-fn save_doc_code_text(documentation: &Documentation, name: &str) {
-    let attr_text = documentation.to_doc_attribute();
-
-    let file_path = format!("{name}.txt");
-    if std::path::Path::new(&file_path).exists() {
-        std::fs::remove_file(&file_path).unwrap();
-    }
-
-    // Open the file in append mode, create it if it doesn't exist
-    let mut file = std::fs::OpenOptions::new()
-        .append(true) // Open in append mode
-        .create(true) // Create the file if it doesn't exist
-        .open(file_path)
-        .unwrap();
-
-    use std::io::Write;
-    file.write_all(attr_text.as_bytes()).unwrap();
-}
-
+#[expect(clippy::needless_pass_by_value)]
 fn print_docs(
     providers: Vec<Box<dyn DocProvider>>,
     doc_sections: Vec<DocSection>,
@@ -254,7 +236,9 @@
         for f in &providers_with_no_docs {
             eprintln!("  - {f}");
         }
-        not_impl_err!("Some functions do not have documentation. Please implement `documentation` for: {providers_with_no_docs:?}")
+        not_impl_err!(
+            "Some functions do not have documentation. Please implement `documentation` for: {providers_with_no_docs:?}"
+        )
     } else {
         Ok(docs)
     }
@@ -303,8 +287,19 @@ impl DocProvider for WindowUDF {
     }
 }
 
-#[allow(clippy::borrowed_box)]
-#[allow(clippy::ptr_arg)]
+impl DocProvider for Arc<HigherOrderUDF> {
+    fn get_name(&self) -> String {
+        self.name().to_string()
+    }
+    fn get_aliases(&self) -> Vec<String> {
+        self.aliases().iter().map(|a| a.to_string()).collect()
+    }
+    fn get_documentation(&self) -> Option<&Documentation> {
+        self.documentation()
+    }
+}
+
+#[expect(clippy::borrowed_box)]
 fn get_names_and_aliases(functions: &Vec<&Box<dyn DocProvider>>) -> Vec<String> {
     functions
         .iter()
diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs
index 98804e424b407..0f38988c69405 100644
--- a/datafusion/core/src/dataframe/mod.rs
+++ b/datafusion/core/src/dataframe/mod.rs
@@ -26,22 +26,21 @@ use crate::datasource::file_format::csv::CsvFormatFactory;
 use crate::datasource::file_format::format_as_file_type;
 use crate::datasource::file_format::json::JsonFormatFactory;
 use crate::datasource::{
-    provider_as_source, DefaultTableSource, MemTable, TableProvider,
+    DefaultTableSource, MemTable, TableProvider, provider_as_source,
 };
 use crate::error::Result;
-use crate::execution::context::{SessionState, TaskContext};
 use crate::execution::FunctionRegistry;
+use crate::execution::context::{SessionState, TaskContext};
 use crate::logical_expr::utils::find_window_exprs;
 use crate::logical_expr::{
-    col, ident, Expr, JoinType, LogicalPlan, LogicalPlanBuilder,
-    LogicalPlanBuilderOptions, Partitioning, TableType,
+    Expr, JoinType, LogicalPlan, LogicalPlanBuilder, LogicalPlanBuilderOptions,
+    Partitioning, TableType, col, ident,
 };
 use crate::physical_plan::{
-    collect, collect_partitioned, execute_stream, execute_stream_partitioned,
-    ExecutionPlan, SendableRecordBatchStream,
+    ExecutionPlan, SendableRecordBatchStream, collect, collect_partitioned,
+    execute_stream, execute_stream_partitioned,
 };
 use crate::prelude::SessionContext;
-use std::any::Any;
 use std::borrow::Cow;
 use std::collections::{HashMap, HashSet};
 use std::sync::Arc;
@@ -49,20 +48,20 @@
 use arrow::array::{Array, ArrayRef, Int64Array, StringArray};
 use arrow::compute::{cast, concat};
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use arrow_schema::FieldRef;
 use datafusion_common::config::{CsvOptions, JsonOptions};
 use datafusion_common::{
-    exec_err, internal_datafusion_err, not_impl_err, plan_datafusion_err, plan_err,
     Column, DFSchema, DataFusionError, ParamValues, ScalarValue, SchemaError,
-    TableReference, UnnestOptions,
+    TableReference, UnnestOptions, exec_err, internal_datafusion_err, not_impl_err,
+    plan_datafusion_err, plan_err, unqualified_field_not_found,
 };
 use datafusion_expr::select_expr::SelectExpr;
 use datafusion_expr::{
-    case,
+    ExplainOption, SortExpr, TableProviderFilterPushDown, UNNAMED_TABLE, case,
     dml::InsertOp,
     expr::{Alias, ScalarFunction},
     is_null, lit,
     utils::COUNT_STAR_EXPANSION,
-    ExplainOption, SortExpr, TableProviderFilterPushDown, UNNAMED_TABLE,
 };
 use datafusion_functions::core::coalesce;
 use datafusion_functions_aggregate::expr_fn::{
@@ -71,6 +70,7 @@
 
 use async_trait::async_trait;
 use datafusion_catalog::Session;
+use datafusion_expr::extension_types::DFArrayFormatterFactory;
 
 /// Contains options that control how data is
 /// written out from a DataFrame
@@ -78,9 +78,11 @@ pub struct DataFrameWriteOptions {
     /// Controls how new data should be written to the table, determining whether
     /// to append, overwrite, or replace existing data.
     insert_op: InsertOp,
-    /// Controls if all partitions should be coalesced into a single output file
-    /// Generally will have slower performance when set to true.
-    single_file_output: bool,
+    /// Controls if all partitions should be coalesced into a single output file.
+    /// - `None`: Use automatic mode (extension-based heuristic)
+    /// - `Some(true)`: Force single file output at exact path
+    /// - `Some(false)`: Force directory output with generated filenames
+    single_file_output: Option<bool>,
     /// Sets which columns should be used for hive-style partitioned writes by name.
     /// Can be set to empty vec![] for non-partitioned writes.
     partition_by: Vec<String>,
@@ -94,7 +96,7 @@ impl DataFrameWriteOptions {
     pub fn new() -> Self {
         DataFrameWriteOptions {
             insert_op: InsertOp::Append,
-            single_file_output: false,
+            single_file_output: None,
             partition_by: vec![],
             sort_by: vec![],
         }
@@ -107,8 +109,14 @@
     }
 
     /// Set the single_file_output value to true or false
+    ///
+    /// - `true`: Force single file output at the exact path specified
+    /// - `false`: Force directory output with generated filenames
+    ///
+    /// When not called, automatic mode is used (extension-based heuristic).
+    /// When set to true, an output file will always be created even if the DataFrame is empty.
     pub fn with_single_file_output(mut self, single_file_output: bool) -> Self {
-        self.single_file_output = single_file_output;
+        self.single_file_output = Some(single_file_output);
         self
     }
 
@@ -123,6 +131,15 @@
         self.sort_by = sort_by;
         self
     }
+
+    /// Build the options HashMap to pass to CopyTo for sink configuration.
+    fn build_sink_options(&self) -> HashMap<String, String> {
+        let mut options = HashMap::new();
+        if let Some(single_file) = self.single_file_output {
+            options.insert("single_file_output".to_string(), single_file.to_string());
+        }
+        options
+    }
 }
 
 impl Default for DataFrameWriteOptions {
@@ -277,8 +294,11 @@
         self.session_state.create_logical_expr(sql, df_schema)
     }
 
-    /// Consume the DataFrame and produce a physical plan
-    pub async fn create_physical_plan(self) -> Result<Arc<dyn ExecutionPlan>> {
+    /// Create a physical plan from this DataFrame.
+    ///
+    /// The `DataFrame` remains accessible after this call, so you can inspect
+    /// the plan and still call [`DataFrame::collect`] or other execution methods.
+    pub async fn create_physical_plan(&self) -> Result<Arc<dyn ExecutionPlan>> {
         self.session_state.create_physical_plan(&self.plan).await
     }
 
@@ -310,11 +330,20 @@
     pub fn select_columns(self, columns: &[&str]) -> Result<DataFrame> {
         let fields = columns
             .iter()
-            .flat_map(|name| {
-                self.plan
+            .map(|name| {
+                let fields = self
+                    .plan
                     .schema()
-                    .qualified_fields_with_unqualified_name(name)
+                    .qualified_fields_with_unqualified_name(name);
+                if fields.is_empty() {
+                    Err(unqualified_field_not_found(name, self.plan.schema()))
+                } else {
+                    Ok(fields)
+                }
             })
+            .collect::<Result<Vec<_>, _>>()?
+            .into_iter()
+            .flatten()
             .collect::<Vec<_>>();
         let expr: Vec<Expr> = fields
             .into_iter()
@@ -436,15 +465,31 @@
     /// # Ok(())
     /// # }
     /// ```
-    pub fn drop_columns(self, columns: &[&str]) -> Result<DataFrame> {
+    pub fn drop_columns<T>(self, columns: &[T]) -> Result<DataFrame>
+    where
+        T: Into<Column> + Clone,
+    {
         let fields_to_drop = columns
             .iter()
-            .flat_map(|name| {
-                self.plan
-                    .schema()
-                    .qualified_fields_with_unqualified_name(name)
+            .flat_map(|col| {
+                let column: Column = col.clone().into();
+                match column.relation.as_ref() {
+                    Some(_) => {
+                        // qualified_field_from_column returns Result<(Option<&TableReference>, &FieldRef)>
+                        vec![self.plan.schema().qualified_field_from_column(&column)]
+                    }
+                    None => {
+                        // qualified_fields_with_unqualified_name returns Vec<(Option<&TableReference>, &FieldRef)>
+                        self.plan
+                            .schema()
+                            .qualified_fields_with_unqualified_name(&column.name)
+                            .into_iter()
+                            .map(Ok)
+                            .collect::<Vec<_>>()
+                    }
+                }
             })
-            .collect::<Vec<_>>();
+            .collect::<Result<Vec<_>, _>>()?;
         let expr: Vec<Expr> = self
             .plan
             .schema()
@@ -470,7 +515,7 @@
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_json("tests/data/unnest.json", NdJsonReadOptions::default()).await?;
+    /// let df = ctx.read_json("tests/data/unnest.json", JsonReadOptions::default()).await?;
     /// // expand into multiple columns if it's json array, flatten field name if it's nested structure
     /// let df = df.unnest_columns(&["b","c","d"])?;
     /// let expected = vec![
@@ -1474,6 +1519,11 @@
         let options = self.session_state.config().options().format.clone();
         let arrow_options: arrow::util::display::FormatOptions = (&options).try_into()?;
 
+        let registry = self.session_state.extension_type_registry();
+        let formatter_factory = DFArrayFormatterFactory::new(Arc::clone(registry));
+        let arrow_options =
+            arrow_options.with_formatter_factory(Some(&formatter_factory));
+
         let results = self.collect().await?;
         Ok(
             pretty::pretty_format_batches_with_options(&results, &arrow_options)?
@@ -1655,7 +1705,7 @@
     pub fn into_view(self) -> Arc<dyn TableProvider> {
         Arc::new(DataFrameTableProvider {
             plan: self.plan,
-            table_type: TableType::Temporary,
+            table_type: TableType::View,
         })
     }
 
@@ -2013,6 +2063,8 @@
 
         let file_type = format_as_file_type(format);
 
+        let copy_options = options.build_sink_options();
+
         let plan = if options.sort_by.is_empty() {
             self.plan
         } else {
@@ -2025,7 +2077,7 @@
             plan,
             path.into(),
             file_type,
-            HashMap::new(),
+            copy_options,
             options.partition_by,
         )?
         .build()?;
@@ -2081,6 +2133,8 @@
 
         let file_type = format_as_file_type(format);
 
+        let copy_options = options.build_sink_options();
+
         let plan = if options.sort_by.is_empty() {
             self.plan
         } else {
@@ -2093,7 +2147,7 @@
             plan,
             path.into(),
             file_type,
-            Default::default(),
+            copy_options,
             options.partition_by,
         )?
         .build()?;
@@ -2232,7 +2286,7 @@
             .schema()
             .iter()
             .map(|(qualifier, field)| {
-                if qualifier.eq(&qualifier_rename) && field.as_ref() == field_rename {
+                if qualifier.eq(&qualifier_rename) && field == field_rename {
                     (
                         col(Column::from((qualifier, field)))
                             .alias_qualified(qualifier.cloned(), new_name),
@@ -2321,6 +2375,10 @@
     /// Cache DataFrame as a memory table.
     ///
+    /// The default behavior can be changed using
+    /// a [`crate::execution::session_state::CacheFactory`]
+    /// configured via [`SessionState`].
+    ///
     /// ```
     /// # use datafusion::prelude::*;
     /// # use datafusion::error::Result;
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// # }
     /// ```
     pub async fn cache(self) -> Result<DataFrame> {
-        let context = SessionContext::new_with_state((*self.session_state).clone());
-        // The schema is consistent with the output
-        let plan = self.clone().create_physical_plan().await?;
-        let schema = plan.schema();
-        let task_ctx = Arc::new(self.task_ctx());
-        let partitions = collect_partitioned(plan, task_ctx).await?;
-        let mem_table = MemTable::try_new(schema, partitions)?;
-        context.read_table(Arc::new(mem_table))
+        if let Some(cache_factory) = self.session_state.cache_factory() {
+            let new_plan =
+                cache_factory.create(self.plan, self.session_state.as_ref())?;
+            Ok(Self::new(*self.session_state, new_plan))
+        } else {
+            let context = SessionContext::new_with_state((*self.session_state).clone());
+            // The schema is consistent with the output
+            let plan = self.create_physical_plan().await?;
+            let schema = plan.schema();
+            let task_ctx = Arc::new(self.task_ctx());
+            let partitions = collect_partitioned(plan, task_ctx).await?;
+            let mem_table = MemTable::try_new(schema, partitions)?;
+            context.read_table(Arc::new(mem_table))
+        }
     }
 
     /// Apply an alias to the DataFrame.
@@ -2383,6 +2447,7 @@
     /// # Ok(())
     /// # }
     /// ```
+    #[expect(clippy::needless_pass_by_value)]
     pub fn fill_null(
         &self,
         value: ScalarValue,
@@ -2393,7 +2458,7 @@
                 .schema()
                 .fields()
                 .iter()
-                .map(|f| f.as_ref().clone())
+                .map(Arc::clone)
                 .collect()
         } else {
             self.find_columns(&columns)?
@@ -2430,7 +2495,7 @@
     }
 
     // Helper to find columns from names
-    fn find_columns(&self, names: &[String]) -> Result<Vec<Field>> {
+    fn find_columns(&self, names: &[String]) -> Result<Vec<FieldRef>> {
         let schema = self.logical_plan().schema();
         names
             .iter()
@@ -2443,6 +2508,48 @@
             .collect()
     }
 
+    /// Find qualified columns for this dataframe from names
+    ///
+    /// # Arguments
+    /// * `names` - Unqualified names to find.
+    ///
+    /// # Example
+    /// ```
+    /// # use datafusion::prelude::*;
+    /// # use datafusion::error::Result;
+    /// # use datafusion_common::ScalarValue;
+    /// # #[tokio::main]
+    /// # async fn main() -> Result<()> {
+    /// let ctx = SessionContext::new();
+    /// ctx.register_csv("first_table", "tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
+    /// let df = ctx.table("first_table").await?;
+    /// ctx.register_csv("second_table", "tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
+    /// let df2 = ctx.table("second_table").await?;
+    /// let join_expr = df.find_qualified_columns(&["a"])?.iter()
+    ///     .zip(df2.find_qualified_columns(&["a"])?.iter())
+    ///     .map(|(col1, col2)| col(*col1).eq(col(*col2)))
+    ///     .collect::<Vec<_>>();
+    /// let df3 = df.join_on(df2, JoinType::Inner, join_expr)?;
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub fn find_qualified_columns(
+        &self,
+        names: &[&str],
+    ) -> Result<Vec<(Option<&TableReference>, &FieldRef)>> {
+        let schema = self.logical_plan().schema();
+        names
+            .iter()
+            .map(|name| {
+                schema
+                    .qualified_field_from_column(&Column::from_name(*name))
+                    .map_err(|_| plan_datafusion_err!("Column '{}' not found", name))
+            })
+            .collect()
+    }
+
+    /// Helper for creating DataFrame.
     /// # Example
     /// ```
@@ -2540,10 +2647,6 @@ struct DataFrameTableProvider {
 
 #[async_trait]
 impl TableProvider for DataFrameTableProvider {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
     fn get_logical_plan(&self) -> Option<Cow<'_, LogicalPlan>> {
         Some(Cow::Borrowed(&self.plan))
     }
diff --git a/datafusion/core/src/dataframe/parquet.rs b/datafusion/core/src/dataframe/parquet.rs
index cb8a6cf29541b..83ffbb151773b 100644
--- a/datafusion/core/src/dataframe/parquet.rs
+++ b/datafusion/core/src/dataframe/parquet.rs
@@ -76,6 +76,8 @@ impl DataFrame {
 
         let file_type = format_as_file_type(format);
 
+        let copy_options = options.build_sink_options();
+
         let plan = if options.sort_by.is_empty() {
             self.plan
         } else {
@@ -88,7 +90,7 @@
             plan,
             path.into(),
             file_type,
-            Default::default(),
+            copy_options,
             options.partition_by,
         )?
         .build()?;
@@ -105,7 +107,6 @@
 #[cfg(test)]
 mod tests {
     use std::collections::HashMap;
-    use std::sync::Arc;
 
     use super::super::Result;
     use super::*;
@@ -125,6 +126,19 @@
     use tempfile::TempDir;
     use url::Url;
 
+    /// Helper to extract a metric value by name from aggregated metrics.
+    fn metric_usize(
+        aggregated: &datafusion_physical_expr_common::metrics::MetricsSet,
+        name: &str,
+    ) -> usize {
+        aggregated
+            .iter()
+            .find(|m| m.value().name() == name)
+            .unwrap_or_else(|| panic!("should have {name} metric"))
+            .value()
+            .as_usize()
+    }
+
     #[tokio::test]
     async fn filter_pushdown_dataframe() -> Result<()> {
         let ctx = SessionContext::new();
@@ -150,7 +164,7 @@
         let plan = df.explain(false, false)?.collect().await?;
         // Filters all the way to Parquet
         let formatted = pretty::pretty_format_batches(&plan)?.to_string();
-        assert!(formatted.contains("FilterExec: id@0 = 1"));
+        assert!(formatted.contains("FilterExec: id@0 = 1"), "{formatted}");
 
         Ok(())
     }
@@ -298,8 +312,8 @@
 
         // Read encrypted parquet
         let ctx: SessionContext = SessionContext::new();
-        let read_options =
-            ParquetReadOptions::default().file_decryption_properties((&decrypt).into());
+        let read_options = ParquetReadOptions::default()
+            .file_decryption_properties((&decrypt).try_into()?);
 
         ctx.register_parquet("roundtrip_parquet", &tempfile_str, read_options.clone())
             .await?;
@@ -324,4 +338,357 @@
 
         Ok(())
     }
+
+    /// Test FileOutputMode::SingleFile - explicitly request single file output
+    /// for paths WITHOUT file extensions. This verifies the fix for the regression
+    /// where extension heuristics ignored the explicit with_single_file_output(true).
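A usage sketch of the option under test (the output path here is illustrative): with `with_single_file_output(true)`, the exact path is written as one file even without an extension.

```rust
// Sketch only: forces one file at the exact path, overriding the
// extension heuristic that would otherwise create a directory.
use datafusion::dataframe::{DataFrame, DataFrameWriteOptions};
use datafusion::error::Result;

async fn write_one_file(df: DataFrame) -> Result<()> {
    df.write_parquet(
        "/tmp/data_no_ext", // illustrative path; note: no .parquet suffix
        DataFrameWriteOptions::new().with_single_file_output(true),
        None,
    )
    .await?;
    Ok(())
}
```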
+ #[tokio::test] + async fn test_file_output_mode_single_file() -> Result<()> { + use arrow::array::Int32Array; + use arrow::datatypes::{DataType, Field, Schema}; + use arrow::record_batch::RecordBatch; + + let ctx = SessionContext::new(); + let tmp_dir = TempDir::new()?; + + // Path WITHOUT .parquet extension - this is the key scenario + let output_path = tmp_dir.path().join("data_no_ext"); + let output_path_str = output_path.to_str().unwrap(); + + let df = ctx.read_batch(RecordBatch::try_new( + Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + )?)?; + + // Explicitly request single file output + df.write_parquet( + output_path_str, + DataFrameWriteOptions::new().with_single_file_output(true), + None, + ) + .await?; + + // Verify: output should be a FILE, not a directory + assert!( + output_path.is_file(), + "Expected single file at {:?}, but got is_file={}, is_dir={}", + output_path, + output_path.is_file(), + output_path.is_dir() + ); + + // Verify the file is readable as parquet + let file = std::fs::File::open(&output_path)?; + let reader = parquet::file::reader::SerializedFileReader::new(file)?; + let metadata = reader.metadata(); + assert_eq!(metadata.num_row_groups(), 1); + assert_eq!(metadata.file_metadata().num_rows(), 3); + + Ok(()) + } + + /// Test FileOutputMode::Automatic - uses extension heuristic. + /// Path WITH extension -> single file; path WITHOUT extension -> directory. + #[tokio::test] + async fn test_file_output_mode_automatic() -> Result<()> { + use arrow::array::Int32Array; + use arrow::datatypes::{DataType, Field, Schema}; + use arrow::record_batch::RecordBatch; + + let ctx = SessionContext::new(); + let tmp_dir = TempDir::new()?; + + let schema = + Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let batch = RecordBatch::try_new( + schema, + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + )?; + + // Case 1: Path WITH extension -> should create single file (Automatic mode) + let output_with_ext = tmp_dir.path().join("data.parquet"); + let df = ctx.read_batch(batch.clone())?; + df.write_parquet( + output_with_ext.to_str().unwrap(), + DataFrameWriteOptions::new(), // Automatic mode (default) + None, + ) + .await?; + + assert!( + output_with_ext.is_file(), + "Path with extension should be a single file, got is_file={}, is_dir={}", + output_with_ext.is_file(), + output_with_ext.is_dir() + ); + + // Case 2: Path WITHOUT extension -> should create directory (Automatic mode) + let output_no_ext = tmp_dir.path().join("data_dir"); + let df = ctx.read_batch(batch)?; + df.write_parquet( + output_no_ext.to_str().unwrap(), + DataFrameWriteOptions::new(), // Automatic mode (default) + None, + ) + .await?; + + assert!( + output_no_ext.is_dir(), + "Path without extension should be a directory, got is_file={}, is_dir={}", + output_no_ext.is_file(), + output_no_ext.is_dir() + ); + + Ok(()) + } + + /// Test that ParquetSink exposes rows_written, bytes_written, and + /// elapsed_compute metrics via DataSinkExec. 
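The metrics tests that follow read these counters back off the executed plan; a small sketch of that lookup, mirroring the `metric_usize` helper defined earlier in this module:

```rust
// Sketch only: pulls an aggregated sink metric off an executed COPY plan,
// the same way metric_usize above does. Returns None if the plan
// exposes no metrics.
use std::sync::Arc;
use datafusion::physical_plan::ExecutionPlan;

fn rows_written(plan: &Arc<dyn ExecutionPlan>) -> Option<usize> {
    let aggregated = plan.metrics()?.aggregate_by_name();
    aggregated
        .iter()
        .find(|m| m.value().name() == "rows_written")
        .map(|m| m.value().as_usize())
}
```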
+    #[tokio::test]
+    async fn test_parquet_sink_metrics() -> Result<()> {
+        use arrow::array::Int32Array;
+        use arrow::datatypes::{DataType, Field, Schema};
+        use arrow::record_batch::RecordBatch;
+        use datafusion_execution::TaskContext;
+
+        use futures::TryStreamExt;
+
+        let ctx = SessionContext::new();
+        let tmp_dir = TempDir::new()?;
+        let output_path = tmp_dir.path().join("metrics_test.parquet");
+        let output_path_str = output_path.to_str().unwrap();
+
+        // Register a table with 100 rows
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("val", DataType::Int32, false),
+        ]));
+        let ids: Vec<i32> = (0..100).collect();
+        let vals: Vec<i32> = (100..200).collect();
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(Int32Array::from(ids)),
+                Arc::new(Int32Array::from(vals)),
+            ],
+        )?;
+        ctx.register_batch("source", batch)?;
+
+        // Create the physical plan for COPY TO
+        let df = ctx
+            .sql(&format!(
+                "COPY source TO '{output_path_str}' STORED AS PARQUET"
+            ))
+            .await?;
+        let plan = df.create_physical_plan().await?;
+
+        // Execute the plan
+        let task_ctx = Arc::new(TaskContext::from(&ctx.state()));
+        let stream = plan.execute(0, task_ctx)?;
+        let _batches: Vec<_> = stream.try_collect().await?;
+
+        // Check metrics on the DataSinkExec (top-level plan)
+        let metrics = plan
+            .metrics()
+            .expect("DataSinkExec should return metrics from ParquetSink");
+        let aggregated = metrics.aggregate_by_name();
+
+        // rows_written should be 100
+        assert_eq!(
+            metric_usize(&aggregated, "rows_written"),
+            100,
+            "expected 100 rows written"
+        );
+
+        // bytes_written should be > 0
+        let bytes_written = metric_usize(&aggregated, "bytes_written");
+        assert!(
+            bytes_written > 0,
+            "expected bytes_written > 0, got {bytes_written}"
+        );
+
+        // elapsed_compute should be > 0
+        let elapsed = metric_usize(&aggregated, "elapsed_compute");
+        assert!(elapsed > 0, "expected elapsed_compute > 0");
+
+        Ok(())
+    }
+
+    /// Test that ParquetSink metrics work with single_file_parallelism enabled.
+    #[tokio::test]
+    async fn test_parquet_sink_metrics_parallel() -> Result<()> {
+        use arrow::array::Int32Array;
+        use arrow::datatypes::{DataType, Field, Schema};
+        use arrow::record_batch::RecordBatch;
+        use datafusion_execution::TaskContext;
+
+        use futures::TryStreamExt;
+
+        let ctx = SessionContext::new();
+        ctx.sql("SET datafusion.execution.parquet.allow_single_file_parallelism = true")
+            .await?
+            .collect()
+            .await?;
+
+        let tmp_dir = TempDir::new()?;
+        let output_path = tmp_dir.path().join("metrics_parallel.parquet");
+        let output_path_str = output_path.to_str().unwrap();
+
+        let schema =
+            Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
+        let ids: Vec<i32> = (0..50).collect();
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![Arc::new(Int32Array::from(ids))],
+        )?;
+        ctx.register_batch("source2", batch)?;
+
+        let df = ctx
+            .sql(&format!(
+                "COPY source2 TO '{output_path_str}' STORED AS PARQUET"
+            ))
+            .await?;
+        let plan = df.create_physical_plan().await?;
+        let task_ctx = Arc::new(TaskContext::from(&ctx.state()));
+        let stream = plan.execute(0, task_ctx)?;
+        let _batches: Vec<_> = stream.try_collect().await?;
+
+        let metrics = plan.metrics().expect("DataSinkExec should return metrics");
+        let aggregated = metrics.aggregate_by_name();
+
+        assert_eq!(metric_usize(&aggregated, "rows_written"), 50);
+        assert!(metric_usize(&aggregated, "bytes_written") > 0);
+        assert!(
+            metric_usize(&aggregated, "elapsed_compute") > 0,
+            "expected elapsed_compute > 0 on parallel path"
+        );
+
+        Ok(())
+    }
+
+    /// Test that ParquetSink reports a non-zero elapsed_compute on the sequential
+    /// write path (allow_single_file_parallelism = false), where elapsed_compute
+    /// is computed as total_write_time - io_time via TimingWriter.
+    #[tokio::test]
+    async fn test_parquet_sink_metrics_sequential() -> Result<()> {
+        use arrow::array::Int32Array;
+        use arrow::datatypes::{DataType, Field, Schema};
+        use arrow::record_batch::RecordBatch;
+        use datafusion_execution::TaskContext;
+
+        use futures::TryStreamExt;
+
+        let ctx = SessionContext::new();
+        ctx.sql("SET datafusion.execution.parquet.allow_single_file_parallelism = false")
+            .await?
+            .collect()
+            .await?;
+
+        let tmp_dir = TempDir::new()?;
+        let output_path = tmp_dir.path().join("metrics_sequential.parquet");
+        let output_path_str = output_path.to_str().unwrap();
+
+        let schema =
+            Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
+        let ids: Vec<i32> = (0..50).collect();
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![Arc::new(Int32Array::from(ids))],
+        )?;
+        ctx.register_batch("source_seq", batch)?;
+
+        let df = ctx
+            .sql(&format!(
+                "COPY source_seq TO '{output_path_str}' STORED AS PARQUET"
+            ))
+            .await?;
+        let plan = df.create_physical_plan().await?;
+        let task_ctx = Arc::new(TaskContext::from(&ctx.state()));
+        let stream = plan.execute(0, task_ctx)?;
+        let _batches: Vec<_> = stream.try_collect().await?;
+
+        let metrics = plan
+            .metrics()
+            .expect("DataSinkExec should return metrics from ParquetSink");
+        let aggregated = metrics.aggregate_by_name();
+
+        assert_eq!(metric_usize(&aggregated, "rows_written"), 50);
+        assert!(metric_usize(&aggregated, "bytes_written") > 0);
+        assert!(
+            metric_usize(&aggregated, "elapsed_compute") > 0,
+            "expected elapsed_compute > 0 on sequential path"
+        );
+
+        Ok(())
+    }
+
+    /// Test FileOutputMode::Directory - explicitly request directory output
+    /// even for paths WITH file extensions.
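The counterpart to the earlier sketch: `with_single_file_output(false)` forces a directory of generated part files even when the path looks like a file.

```rust
// Sketch only; path is illustrative. Despite the .parquet suffix,
// the output becomes a directory containing generated filenames.
use datafusion::dataframe::{DataFrame, DataFrameWriteOptions};
use datafusion::error::Result;

async fn write_directory(df: DataFrame) -> Result<()> {
    df.write_parquet(
        "/tmp/output.parquet",
        DataFrameWriteOptions::new().with_single_file_output(false),
        None,
    )
    .await?;
    Ok(())
}
```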
+ #[tokio::test] + async fn test_file_output_mode_directory() -> Result<()> { + use arrow::array::Int32Array; + use arrow::datatypes::{DataType, Field, Schema}; + use arrow::record_batch::RecordBatch; + + let ctx = SessionContext::new(); + let tmp_dir = TempDir::new()?; + + // Path WITH .parquet extension but explicitly requesting directory output + let output_path = tmp_dir.path().join("output.parquet"); + let output_path_str = output_path.to_str().unwrap(); + + let df = ctx.read_batch(RecordBatch::try_new( + Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + )?)?; + + // Explicitly request directory output (single_file_output = false) + df.write_parquet( + output_path_str, + DataFrameWriteOptions::new().with_single_file_output(false), + None, + ) + .await?; + + // Verify: output should be a DIRECTORY, not a single file + assert!( + output_path.is_dir(), + "Expected directory at {:?}, but got is_file={}, is_dir={}", + output_path, + output_path.is_file(), + output_path.is_dir() + ); + + // Verify the directory contains parquet file(s) + let entries: Vec<_> = std::fs::read_dir(&output_path)? + .filter_map(|e| e.ok()) + .collect(); + assert!( + !entries.is_empty(), + "Directory should contain at least one file" + ); + + Ok(()) + } + + /// Test that `create_physical_plan` does not consume the `DataFrame`, so + /// callers can inspect (e.g. log) the physical plan and then still call + /// `write_parquet` or any other execution method on the same `DataFrame`. + #[tokio::test] + async fn create_physical_plan_does_not_consume_dataframe() -> Result<()> { + use crate::prelude::CsvReadOptions; + let ctx = SessionContext::new(); + let df = ctx + .read_csv("tests/data/example.csv", CsvReadOptions::new()) + .await?; + + // Obtain the physical plan for inspection without consuming `df`. + let _physical_plan = df.create_physical_plan().await?; + + // `df` is still usable — collect the results. + let batches = df.collect().await?; + assert!(!batches.is_empty()); + + Ok(()) + } } diff --git a/datafusion/core/src/datasource/dynamic_file.rs b/datafusion/core/src/datasource/dynamic_file.rs index 256a11ba693b5..50ee96da3dff0 100644 --- a/datafusion/core/src/datasource/dynamic_file.rs +++ b/datafusion/core/src/datasource/dynamic_file.rs @@ -20,9 +20,9 @@ use std::sync::Arc; +use crate::datasource::TableProvider; use crate::datasource::listing::ListingTableConfigExt; use crate::datasource::listing::{ListingTable, ListingTableConfig, ListingTableUrl}; -use crate::datasource::TableProvider; use crate::error::Result; use crate::execution::context::SessionState; diff --git a/datafusion/core/src/datasource/file_format/arrow.rs b/datafusion/core/src/datasource/file_format/arrow.rs index 8701f96eb3b84..338de76b1353b 100644 --- a/datafusion/core/src/datasource/file_format/arrow.rs +++ b/datafusion/core/src/datasource/file_format/arrow.rs @@ -17,3 +17,96 @@ //! Re-exports the [`datafusion_datasource_arrow::file_format`] module, and contains tests for it. 
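The two tests below drive the empty-result case through SQL; a minimal sketch of the COPY they exercise, which should still produce a valid Arrow IPC file for a zero-row query:

```rust
// Sketch only: COPY of a zero-row SELECT should still create the file at
// `path`, preserving the schema.
use datafusion::error::Result;
use datafusion::prelude::SessionContext;

async fn copy_empty_arrow(ctx: &SessionContext, path: &str) -> Result<()> {
    ctx.sql(&format!(
        "COPY (SELECT CAST(1 AS BIGINT) AS id LIMIT 0) TO '{path}' STORED AS ARROW"
    ))
    .await?
    .collect()
    .await?;
    Ok(())
}
```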
pub use datafusion_datasource_arrow::file_format::*;
+
+#[cfg(test)]
+mod tests {
+    use futures::StreamExt;
+    use std::sync::Arc;
+
+    use arrow::array::{Int64Array, StringArray};
+    use arrow::datatypes::{DataType, Field, Schema};
+    use arrow::record_batch::RecordBatch;
+    use datafusion_common::Result;
+
+    use crate::execution::options::ArrowReadOptions;
+    use crate::prelude::SessionContext;
+
+    #[tokio::test]
+    async fn test_write_empty_arrow_from_sql() -> Result<()> {
+        let ctx = SessionContext::new();
+
+        let tmp_dir = tempfile::TempDir::new()?;
+        let path = format!("{}/empty_sql.arrow", tmp_dir.path().to_string_lossy());
+
+        ctx.sql(&format!(
+            "COPY (SELECT CAST(1 AS BIGINT) AS id LIMIT 0) TO '{path}' STORED AS ARROW",
+        ))
+        .await?
+        .collect()
+        .await?;
+
+        assert!(std::path::Path::new(&path).exists());
+
+        let read_df = ctx.read_arrow(&path, ArrowReadOptions::default()).await?;
+        let stream = read_df.execute_stream().await?;
+
+        assert_eq!(stream.schema().fields().len(), 1);
+        assert_eq!(stream.schema().field(0).name(), "id");
+
+        let results: Vec<_> = stream.collect().await;
+        let total_rows: usize = results
+            .iter()
+            .filter_map(|r| r.as_ref().ok())
+            .map(|b| b.num_rows())
+            .sum();
+        assert_eq!(total_rows, 0);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_write_empty_arrow_from_record_batch() -> Result<()> {
+        let ctx = SessionContext::new();
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int64, false),
+            Field::new("name", DataType::Utf8, true),
+        ]));
+        let empty_batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![
+                Arc::new(Int64Array::from(Vec::<i64>::new())),
+                Arc::new(StringArray::from(Vec::<Option<&str>>::new())),
+            ],
+        )?;
+
+        let tmp_dir = tempfile::TempDir::new()?;
+        let path = format!("{}/empty_batch.arrow", tmp_dir.path().to_string_lossy());
+
+        ctx.register_batch("empty_table", empty_batch)?;
+
+        ctx.sql(&format!("COPY empty_table TO '{path}' STORED AS ARROW"))
+            .await?
+ .collect() + .await?; + + assert!(std::path::Path::new(&path).exists()); + + let read_df = ctx.read_arrow(&path, ArrowReadOptions::default()).await?; + let stream = read_df.execute_stream().await?; + + assert_eq!(stream.schema().fields().len(), 2); + assert_eq!(stream.schema().field(0).name(), "id"); + assert_eq!(stream.schema().field(1).name(), "name"); + + let results: Vec<_> = stream.collect().await; + let total_rows: usize = results + .iter() + .filter_map(|r| r.as_ref().ok()) + .map(|b| b.num_rows()) + .sum(); + assert_eq!(total_rows, 0); + + Ok(()) + } +} diff --git a/datafusion/core/src/datasource/file_format/avro.rs b/datafusion/core/src/datasource/file_format/avro.rs index 3428d08a6ae52..a8b48cc736c92 100644 --- a/datafusion/core/src/datasource/file_format/avro.rs +++ b/datafusion/core/src/datasource/file_format/avro.rs @@ -26,20 +26,21 @@ mod tests { use crate::{ datasource::file_format::test_util::scan_format, prelude::SessionContext, }; - use arrow::array::{as_string_array, Array}; + use arrow::array::{Array, as_string_array}; use datafusion_catalog::Session; use datafusion_common::test_util::batches_to_string; use datafusion_common::{ + Result, cast::{ as_binary_array, as_boolean_array, as_float32_array, as_float64_array, as_int32_array, as_timestamp_microsecond_array, }, - test_util, Result, + test_util, }; use datafusion_datasource_avro::AvroFormat; use datafusion_execution::config::SessionConfig; - use datafusion_physical_plan::{collect, ExecutionPlan}; + use datafusion_physical_plan::{ExecutionPlan, collect}; use futures::StreamExt; use insta::assert_snapshot; @@ -94,7 +95,7 @@ mod tests { .schema() .fields() .iter() - .map(|f| format!("{}: {:?}", f.name(), f.data_type())) + .map(|f| format!("{}: {}", f.name(), f.data_type())) .collect(); assert_eq!( vec![ @@ -108,7 +109,7 @@ mod tests { "double_col: Float64", "date_string_col: Binary", "string_col: Binary", - "timestamp_col: Timestamp(Microsecond, None)", + "timestamp_col: Timestamp(µs, \"+00:00\")", ], x ); @@ -116,20 +117,20 @@ mod tests { let batches = collect(exec, task_ctx).await?; assert_eq!(batches.len(), 1); - assert_snapshot!(batches_to_string(&batches),@r###" - +----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+ - | id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col | string_col | timestamp_col | - +----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+ - | 4 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30332f30312f3039 | 30 | 2009-03-01T00:00:00 | - | 5 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30332f30312f3039 | 31 | 2009-03-01T00:01:00 | - | 6 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30342f30312f3039 | 30 | 2009-04-01T00:00:00 | - | 7 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30342f30312f3039 | 31 | 2009-04-01T00:01:00 | - | 2 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30322f30312f3039 | 30 | 2009-02-01T00:00:00 | - | 3 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30322f30312f3039 | 31 | 2009-02-01T00:01:00 | - | 0 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30312f30312f3039 | 30 | 2009-01-01T00:00:00 | - | 1 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30312f30312f3039 | 31 | 2009-01-01T00:01:00 | - +----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+ - "###); + 
assert_snapshot!(batches_to_string(&batches),@r" + +----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+----------------------+ + | id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col | string_col | timestamp_col | + +----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+----------------------+ + | 4 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30332f30312f3039 | 30 | 2009-03-01T00:00:00Z | + | 5 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30332f30312f3039 | 31 | 2009-03-01T00:01:00Z | + | 6 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30342f30312f3039 | 30 | 2009-04-01T00:00:00Z | + | 7 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30342f30312f3039 | 31 | 2009-04-01T00:01:00Z | + | 2 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30322f30312f3039 | 30 | 2009-02-01T00:00:00Z | + | 3 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30322f30312f3039 | 31 | 2009-02-01T00:01:00Z | + | 0 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30312f30312f3039 | 30 | 2009-01-01T00:00:00Z | + | 1 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30312f30312f3039 | 31 | 2009-01-01T00:01:00Z | + +----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+----------------------+ + "); Ok(()) } @@ -245,7 +246,10 @@ mod tests { values.push(array.value(i)); } - assert_eq!("[1235865600000000, 1235865660000000, 1238544000000000, 1238544060000000, 1233446400000000, 1233446460000000, 1230768000000000, 1230768060000000]", format!("{values:?}")); + assert_eq!( + "[1235865600000000, 1235865660000000, 1238544000000000, 1238544060000000, 1233446400000000, 1233446460000000, 1230768000000000, 1230768060000000]", + format!("{values:?}") + ); Ok(()) } diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index 52fb8ae904ebf..a068b4f5c0413 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -32,12 +32,12 @@ mod tests { use crate::prelude::{CsvReadOptions, SessionConfig, SessionContext}; use arrow_schema::{DataType, Field, Schema, SchemaRef}; use datafusion_catalog::Session; + use datafusion_common::Result; use datafusion_common::cast::as_string_array; use datafusion_common::config::CsvOptions; use datafusion_common::internal_err; use datafusion_common::stats::Precision; use datafusion_common::test_util::{arrow_test_data, batches_to_string}; - use datafusion_common::Result; use datafusion_datasource::decoder::{ BatchDeserializer, DecoderDeserializer, DeserializerOutput, }; @@ -45,7 +45,7 @@ mod tests { use datafusion_datasource::file_format::FileFormat; use datafusion_datasource::write::BatchSerializer; use datafusion_expr::{col, lit}; - use datafusion_physical_plan::{collect, ExecutionPlan}; + use datafusion_physical_plan::{ExecutionPlan, collect}; use arrow::array::{ Array, BooleanArray, Float64Array, Int32Array, RecordBatch, StringArray, @@ -57,15 +57,16 @@ mod tests { use bytes::Bytes; use chrono::DateTime; use datafusion_common::parsers::CompressionTypeVariant; - use futures::stream::BoxStream; use futures::StreamExt; + use futures::stream::BoxStream; use insta::assert_snapshot; use object_store::chunked::ChunkedStore; use object_store::local::LocalFileSystem; use object_store::path::Path; use object_store::{ Attributes, GetOptions, GetResult, GetResultPayload, ListResult, MultipartUpload, - 
ObjectMeta, ObjectStore, PutMultipartOptions, PutOptions, PutPayload, PutResult, + ObjectMeta, ObjectStore, ObjectStoreExt, PutMultipartOptions, PutOptions, + PutPayload, PutResult, }; use regex::Regex; use rstest::*; @@ -104,10 +105,6 @@ mod tests { unimplemented!() } - async fn get(&self, location: &Path) -> object_store::Result { - self.get_opts(location, GetOptions::default()).await - } - async fn get_opts( &self, location: &Path, @@ -117,6 +114,8 @@ mod tests { let len = bytes.len() as u64; let range = 0..len * self.max_iterations; let arc = self.iterations_detected.clone(); + #[expect(clippy::result_large_err)] + // closure only ever returns Ok; Err type is never constructed let stream = futures::stream::repeat_with(move || { let arc_inner = arc.clone(); *arc_inner.lock().unwrap() += 1; @@ -147,14 +146,6 @@ mod tests { unimplemented!() } - async fn head(&self, _location: &Path) -> object_store::Result { - unimplemented!() - } - - async fn delete(&self, _location: &Path) -> object_store::Result<()> { - unimplemented!() - } - fn list( &self, _prefix: Option<&Path>, @@ -169,17 +160,21 @@ mod tests { unimplemented!() } - async fn copy(&self, _from: &Path, _to: &Path) -> object_store::Result<()> { - unimplemented!() - } - - async fn copy_if_not_exists( + async fn copy_opts( &self, _from: &Path, _to: &Path, + _options: object_store::CopyOptions, ) -> object_store::Result<()> { unimplemented!() } + + fn delete_stream( + &self, + _locations: BoxStream<'static, object_store::Result>, + ) -> BoxStream<'static, object_store::Result> { + unimplemented!() + } } impl VariableStream { @@ -621,15 +616,15 @@ mod tests { .collect() .await?; - assert_snapshot!(batches_to_string(&record_batch), @r###" - +----+------+ - | c2 | c3 | - +----+------+ - | 5 | 36 | - | 5 | -31 | - | 5 | -101 | - +----+------+ - "###); + assert_snapshot!(batches_to_string(&record_batch), @r" + +----+------+ + | c2 | c3 | + +----+------+ + | 5 | 36 | + | 5 | -31 | + | 5 | -101 | + +----+------+ + "); Ok(()) } @@ -706,11 +701,11 @@ mod tests { let re = Regex::new(r"DataSourceExec: file_groups=\{(\d+) group").unwrap(); - if let Some(captures) = re.captures(&plan) { - if let Some(match_) = captures.get(1) { - let n_partitions = match_.as_str().parse::().unwrap(); - return Ok(n_partitions); - } + if let Some(captures) = re.captures(&plan) + && let Some(match_) = captures.get(1) + { + let n_partitions = match_.as_str().parse::().unwrap(); + return Ok(n_partitions); } internal_err!("query contains no DataSourceExec") @@ -736,13 +731,13 @@ mod tests { let query_result = ctx.sql(query).await?.collect().await?; let actual_partitions = count_query_csv_partitions(&ctx, query).await?; - insta::allow_duplicates! {assert_snapshot!(batches_to_string(&query_result),@r###" + insta::allow_duplicates! {assert_snapshot!(batches_to_string(&query_result),@r" +--------------+ | sum(aggr.c2) | +--------------+ | 285 | +--------------+ - "###); + "); } assert_eq!(n_partitions, actual_partitions); @@ -775,13 +770,13 @@ mod tests { let query_result = ctx.sql(query).await?.collect().await?; let actual_partitions = count_query_csv_partitions(&ctx, query).await?; - insta::allow_duplicates! {assert_snapshot!(batches_to_string(&query_result),@r###" + insta::allow_duplicates! 
{assert_snapshot!(batches_to_string(&query_result),@r" +--------------+ | sum(aggr.c3) | +--------------+ | 781 | +--------------+ - "###); + "); } assert_eq!(1, actual_partitions); // Compressed csv won't be scanned in parallel @@ -812,13 +807,13 @@ mod tests { let query_result = ctx.sql(query).await?.collect().await?; let actual_partitions = count_query_csv_partitions(&ctx, query).await?; - insta::allow_duplicates! {assert_snapshot!(batches_to_string(&query_result),@r###" + insta::allow_duplicates! {assert_snapshot!(batches_to_string(&query_result),@r" +--------------+ | sum(aggr.c3) | +--------------+ | 781 | +--------------+ - "###); + "); } assert_eq!(1, actual_partitions); // csv won't be scanned in parallel when newlines_in_values is set @@ -843,10 +838,10 @@ mod tests { let query = "select * from empty where random() > 0.5;"; let query_result = ctx.sql(query).await?.collect().await?; - assert_snapshot!(batches_to_string(&query_result),@r###" - ++ - ++ - "###); + assert_snapshot!(batches_to_string(&query_result),@r" + ++ + ++ + "); Ok(()) } @@ -868,10 +863,10 @@ mod tests { let query = "select * from empty where random() > 0.5;"; let query_result = ctx.sql(query).await?.collect().await?; - assert_snapshot!(batches_to_string(&query_result),@r###" - ++ - ++ - "###); + assert_snapshot!(batches_to_string(&query_result),@r" + ++ + ++ + "); Ok(()) } @@ -944,17 +939,19 @@ mod tests { let files: Vec<_> = std::fs::read_dir(&path).unwrap().collect(); assert_eq!(files.len(), 1); - assert!(files - .last() - .unwrap() - .as_ref() - .unwrap() - .path() - .file_name() - .unwrap() - .to_str() - .unwrap() - .ends_with(".csv.gz")); + assert!( + files + .last() + .unwrap() + .as_ref() + .unwrap() + .path() + .file_name() + .unwrap() + .to_str() + .unwrap() + .ends_with(".csv.gz") + ); Ok(()) } @@ -983,17 +980,19 @@ mod tests { let files: Vec<_> = std::fs::read_dir(&path).unwrap().collect(); assert_eq!(files.len(), 1); - assert!(files - .last() - .unwrap() - .as_ref() - .unwrap() - .path() - .file_name() - .unwrap() - .to_str() - .unwrap() - .ends_with(".csv")); + assert!( + files + .last() + .unwrap() + .as_ref() + .unwrap() + .path() + .file_name() + .unwrap() + .to_str() + .unwrap() + .ends_with(".csv") + ); Ok(()) } @@ -1032,10 +1031,10 @@ mod tests { let query = "select * from empty where random() > 0.5;"; let query_result = ctx.sql(query).await?.collect().await?; - assert_snapshot!(batches_to_string(&query_result),@r###" - ++ - ++ - "###); + assert_snapshot!(batches_to_string(&query_result),@r" + ++ + ++ + "); Ok(()) } @@ -1084,13 +1083,13 @@ mod tests { let query_result = ctx.sql(query).await?.collect().await?; let actual_partitions = count_query_csv_partitions(&ctx, query).await?; - insta::allow_duplicates! {assert_snapshot!(batches_to_string(&query_result),@r###" - +---------------------+ - | sum(empty.column_1) | - +---------------------+ - | 10 | - +---------------------+ - "###);} + insta::allow_duplicates! {assert_snapshot!(batches_to_string(&query_result),@r" + +---------------------+ + | sum(empty.column_1) | + +---------------------+ + | 10 | + +---------------------+ + ");} assert_eq!(n_partitions, actual_partitions); // Won't get partitioned if all files are empty @@ -1132,13 +1131,13 @@ mod tests { file_size }; - insta::allow_duplicates! {assert_snapshot!(batches_to_string(&query_result),@r###" + insta::allow_duplicates! 
{assert_snapshot!(batches_to_string(&query_result),@r" +-----------------------+ | sum(one_col.column_1) | +-----------------------+ | 50 | +-----------------------+ - "###); + "); } assert_eq!(expected_partitions, actual_partitions); @@ -1171,13 +1170,13 @@ mod tests { let query_result = ctx.sql(query).await?.collect().await?; let actual_partitions = count_query_csv_partitions(&ctx, query).await?; - insta::allow_duplicates! {assert_snapshot!(batches_to_string(&query_result),@r###" - +---------------+ - | sum_of_5_cols | - +---------------+ - | 15 | - +---------------+ - "###);} + insta::allow_duplicates! {assert_snapshot!(batches_to_string(&query_result),@r" + +---------------+ + | sum_of_5_cols | + +---------------+ + | 15 | + +---------------+ + ");} assert_eq!(n_partitions, actual_partitions); @@ -1191,7 +1190,9 @@ mod tests { ) -> Result<()> { let schema = csv_schema(); let generator = CsvBatchGenerator::new(batch_size, line_count); - let mut deserializer = csv_deserializer(batch_size, &schema); + + let schema_clone = Arc::clone(&schema); + let mut deserializer = csv_deserializer(batch_size, &schema_clone); for data in generator { deserializer.digest(data); @@ -1230,7 +1231,8 @@ mod tests { ) -> Result<()> { let schema = csv_schema(); let generator = CsvBatchGenerator::new(batch_size, line_count); - let mut deserializer = csv_deserializer(batch_size, &schema); + let schema_clone = Arc::clone(&schema); + let mut deserializer = csv_deserializer(batch_size, &schema_clone); for data in generator { deserializer.digest(data); @@ -1499,7 +1501,7 @@ mod tests { // Create a temp file with a .csv suffix so the reader accepts it let mut tmp = tempfile::Builder::new().suffix(".csv").tempfile()?; // ensures path ends with .csv - // CSV has header "a,b,c". First data row is truncated (only "1,2"), second row is complete. + // CSV has header "a,b,c". First data row is truncated (only "1,2"), second row is complete. 
write!(tmp, "a,b,c\n1,2\n3,4,5\n")?; let path = tmp.path().to_str().unwrap().to_string(); @@ -1529,4 +1531,94 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_write_empty_csv_from_sql() -> Result<()> { + let ctx = SessionContext::new(); + let tmp_dir = tempfile::TempDir::new()?; + let path = format!("{}/empty_sql.csv", tmp_dir.path().to_string_lossy()); + let df = ctx.sql("SELECT CAST(1 AS BIGINT) AS id LIMIT 0").await?; + df.write_csv(&path, crate::dataframe::DataFrameWriteOptions::new(), None) + .await?; + assert!(std::path::Path::new(&path).exists()); + + let read_df = ctx + .read_csv(&path, CsvReadOptions::default().has_header(true)) + .await?; + let stream = read_df.execute_stream().await?; + assert_eq!(stream.schema().fields().len(), 1); + assert_eq!(stream.schema().field(0).name(), "id"); + + let results: Vec<_> = stream.collect().await; + assert_eq!(results.len(), 0); + + Ok(()) + } + + #[tokio::test] + async fn test_write_empty_csv_from_record_batch() -> Result<()> { + let ctx = SessionContext::new(); + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("name", DataType::Utf8, true), + ])); + let empty_batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(arrow::array::Int64Array::from(Vec::<i64>::new())), + Arc::new(StringArray::from(Vec::<Option<&str>>::new())), + ], + )?; + + let tmp_dir = tempfile::TempDir::new()?; + let path = format!("{}/empty_batch.csv", tmp_dir.path().to_string_lossy()); + + // Write empty RecordBatch + let df = ctx.read_batch(empty_batch.clone())?; + df.write_csv(&path, crate::dataframe::DataFrameWriteOptions::new(), None) + .await?; + // Expected the file to exist + assert!(std::path::Path::new(&path).exists()); + + let read_df = ctx + .read_csv(&path, CsvReadOptions::default().has_header(true)) + .await?; + let stream = read_df.execute_stream().await?; + assert_eq!(stream.schema().fields().len(), 2); + assert_eq!(stream.schema().field(0).name(), "id"); + assert_eq!(stream.schema().field(1).name(), "name"); + + let results: Vec<_> = stream.collect().await; + assert_eq!(results.len(), 0); + + Ok(()) + } + + #[tokio::test] + async fn test_infer_schema_with_zero_max_records() -> Result<()> { + let session_ctx = SessionContext::new(); + let state = session_ctx.state(); + + let root = format!("{}/csv", arrow_test_data()); + let format = CsvFormat::default() + .with_has_header(true) + .with_schema_infer_max_rec(0); // Set to 0 to disable inference + let exec = scan_format( + &state, + &format, + None, + &root, + "aggregate_test_100.csv", + None, + None, + ) + .await?; + + // related to https://github.com/apache/datafusion/issues/19417 + for f in exec.schema().fields() { + assert_eq!(*f.data_type(), DataType::Utf8); + } + + Ok(()) + } } diff --git a/datafusion/core/src/datasource/file_format/json.rs b/datafusion/core/src/datasource/file_format/json.rs index 34d3d64f07fb2..5b3e22705620e 100644 --- a/datafusion/core/src/datasource/file_format/json.rs +++ b/datafusion/core/src/datasource/file_format/json.rs @@ -25,7 +25,7 @@ mod tests { use super::*; use crate::datasource::file_format::test_util::scan_format; - use crate::prelude::{NdJsonReadOptions, SessionConfig, SessionContext}; + use crate::prelude::{SessionConfig, SessionContext}; use crate::test::object_store::local_unpartitioned_file; use arrow::array::RecordBatch; use arrow_schema::Schema; @@ -36,7 +36,7 @@ mod tests { BatchDeserializer, DecoderDeserializer, DeserializerOutput, }; use datafusion_datasource::file_format::FileFormat; - use 
datafusion_physical_plan::{collect, ExecutionPlan}; + use datafusion_physical_plan::{ExecutionPlan, collect}; use arrow::compute::concat_batches; use arrow::datatypes::{DataType, Field}; @@ -46,12 +46,54 @@ mod tests { use datafusion_common::internal_err; use datafusion_common::stats::Precision; + use crate::execution::options::JsonReadOptions; use datafusion_common::Result; + use datafusion_datasource::file_compression_type::FileCompressionType; use futures::StreamExt; use insta::assert_snapshot; use object_store::local::LocalFileSystem; use regex::Regex; use rstest::rstest; + // ==================== Test Helpers ==================== + + /// Create a temporary JSON file and return (TempDir, path) + fn create_temp_json(content: &str) -> (tempfile::TempDir, String) { + let tmp_dir = tempfile::TempDir::new().unwrap(); + let path = tmp_dir.path().join("test.json"); + std::fs::write(&path, content).unwrap(); + (tmp_dir, path.to_string_lossy().to_string()) + } + + /// Infer schema from JSON array format file + async fn infer_json_array_schema( + content: &str, + ) -> Result<SchemaRef> { + let (_tmp_dir, path) = create_temp_json(content); + let session = SessionContext::new(); + let ctx = session.state(); + let store = Arc::new(LocalFileSystem::new()) as _; + let format = JsonFormat::default().with_newline_delimited(false); + format + .infer_schema(&ctx, &store, &[local_unpartitioned_file(&path)]) + .await + } + + /// Register a JSON array table and run a query + async fn query_json_array(content: &str, query: &str) -> Result<Vec<RecordBatch>> { + let (_tmp_dir, path) = create_temp_json(content); + let ctx = SessionContext::new(); + let options = JsonReadOptions::default().newline_delimited(false); + ctx.register_json("test_table", &path, options).await?; + ctx.sql(query).await?.collect().await + } + + /// Register a JSON array table and run a query, return formatted string + async fn query_json_array_str(content: &str, query: &str) -> Result<String> { + let result = query_json_array(content, query).await?; + Ok(batches_to_string(&result)) + } + + // ==================== Existing Tests ==================== #[tokio::test] async fn read_small_batches() -> Result<()> { @@ -187,11 +229,11 @@ mod tests { let re = Regex::new(r"file_groups=\{(\d+) group").unwrap(); - if let Some(captures) = re.captures(&plan) { - if let Some(match_) = captures.get(1) { - let count = match_.as_str().parse::<usize>().unwrap(); - return Ok(count); - } + if let Some(captures) = re.captures(&plan) + && let Some(match_) = captures.get(1) + { + let count = match_.as_str().parse::<usize>().unwrap(); + return Ok(count); } internal_err!("Query contains no Exec: file_groups") @@ -208,7 +250,7 @@ mod tests { let ctx = SessionContext::new_with_config(config); let table_path = "tests/data/1.json"; - let options = NdJsonReadOptions::default(); + let options = JsonReadOptions::default(); ctx.register_json("json_parallel", table_path, options) .await?; @@ -218,13 +260,13 @@ mod tests { let result = ctx.sql(query).await?.collect().await?; let actual_partitions = count_num_partitions(&ctx, query).await?; - insta::allow_duplicates! {assert_snapshot!(batches_to_string(&result),@r###" - +----------------------+ - | sum(json_parallel.a) | - +----------------------+ - | -7 | - +----------------------+ - "###);} + insta::allow_duplicates! 
{assert_snapshot!(batches_to_string(&result),@r" + +----------------------+ + | sum(json_parallel.a) | + +----------------------+ + | -7 | + +----------------------+ + ");} assert_eq!(n_partitions, actual_partitions); @@ -240,7 +282,7 @@ mod tests { let ctx = SessionContext::new_with_config(config); let table_path = "tests/data/empty.json"; - let options = NdJsonReadOptions::default(); + let options = JsonReadOptions::default(); ctx.register_json("json_parallel_empty", table_path, options) .await?; @@ -249,10 +291,10 @@ mod tests { let result = ctx.sql(query).await?.collect().await?; - assert_snapshot!(batches_to_string(&result),@r###" - ++ - ++ - "###); + assert_snapshot!(batches_to_string(&result),@r" + ++ + ++ + "); Ok(()) } @@ -284,15 +326,15 @@ mod tests { } assert_eq!(deserializer.next()?, DeserializerOutput::InputExhausted); - assert_snapshot!(batches_to_string(&[all_batches]),@r###" - +----+----+----+----+----+ - | c1 | c2 | c3 | c4 | c5 | - +----+----+----+----+----+ - | 1 | 2 | 3 | 4 | 5 | - | 6 | 7 | 8 | 9 | 10 | - | 11 | 12 | 13 | 14 | 15 | - +----+----+----+----+----+ - "###); + assert_snapshot!(batches_to_string(&[all_batches]),@r" + +----+----+----+----+----+ + | c1 | c2 | c3 | c4 | c5 | + +----+----+----+----+----+ + | 1 | 2 | 3 | 4 | 5 | + | 6 | 7 | 8 | 9 | 10 | + | 11 | 12 | 13 | 14 | 15 | + +----+----+----+----+----+ + "); Ok(()) } @@ -314,7 +356,6 @@ mod tests { .digest(r#"{ "c1": 11, "c2": 12, "c3": 13, "c4": 14, "c5": 15 }"#.into()); let mut all_batches = RecordBatch::new_empty(schema.clone()); - // We get RequiresMoreData after 2 batches because of how json::Decoder works for _ in 0..2 { let output = deserializer.next()?; let DeserializerOutput::RecordBatch(batch) = output else { @@ -324,14 +365,14 @@ mod tests { } assert_eq!(deserializer.next()?, DeserializerOutput::RequiresMoreData); - insta::assert_snapshot!(fmt_batches(&[all_batches]),@r###" - +----+----+----+----+----+ - | c1 | c2 | c3 | c4 | c5 | - +----+----+----+----+----+ - | 1 | 2 | 3 | 4 | 5 | - | 6 | 7 | 8 | 9 | 10 | - +----+----+----+----+----+ - "###); + insta::assert_snapshot!(fmt_batches(&[all_batches]),@r" + +----+----+----+----+----+ + | c1 | c2 | c3 | c4 | c5 | + +----+----+----+----+----+ + | 1 | 2 | 3 | 4 | 5 | + | 6 | 7 | 8 | 9 | 10 | + +----+----+----+----+----+ + "); Ok(()) } @@ -349,4 +390,248 @@ mod tests { fn fmt_batches(batches: &[RecordBatch]) -> String { pretty::pretty_format_batches(batches).unwrap().to_string() } + + #[tokio::test] + async fn test_write_empty_json_from_sql() -> Result<()> { + let ctx = SessionContext::new(); + let tmp_dir = tempfile::TempDir::new()?; + let path = tmp_dir.path().join("empty_sql.json"); + let path = path.to_string_lossy().to_string(); + let df = ctx.sql("SELECT CAST(1 AS BIGINT) AS id LIMIT 0").await?; + df.write_json(&path, crate::dataframe::DataFrameWriteOptions::new(), None) + .await?; + assert!(std::path::Path::new(&path).exists()); + let metadata = std::fs::metadata(&path)?; + assert_eq!(metadata.len(), 0); + Ok(()) + } + + #[tokio::test] + async fn test_write_empty_json_from_record_batch() -> Result<()> { + let ctx = SessionContext::new(); + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("name", DataType::Utf8, true), + ])); + let empty_batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(arrow::array::Int64Array::from(Vec::::new())), + Arc::new(arrow::array::StringArray::from(Vec::>::new())), + ], + )?; + + let tmp_dir = tempfile::TempDir::new()?; + let path = 
tmp_dir.path().join("empty_batch.json"); + let path = path.to_string_lossy().to_string(); + let df = ctx.read_batch(empty_batch.clone())?; + df.write_json(&path, crate::dataframe::DataFrameWriteOptions::new(), None) + .await?; + assert!(std::path::Path::new(&path).exists()); + let metadata = std::fs::metadata(&path)?; + assert_eq!(metadata.len(), 0); + Ok(()) + } + + // ==================== JSON Array Format Tests ==================== + + #[tokio::test] + async fn test_json_array_schema_inference() -> Result<()> { + let schema = infer_json_array_schema( + r#"[{"a": 1, "b": 2.0, "c": true}, {"a": 2, "b": 3.5, "c": false}]"#, + ) + .await?; + + let fields: Vec<_> = schema + .fields() + .iter() + .map(|f| format!("{}: {:?}", f.name(), f.data_type())) + .collect(); + assert_eq!(vec!["a: Int64", "b: Float64", "c: Boolean"], fields); + Ok(()) + } + + #[tokio::test] + async fn test_json_array_empty() -> Result<()> { + let schema = infer_json_array_schema("[]").await?; + assert_eq!(schema.fields().len(), 0); + Ok(()) + } + + #[tokio::test] + async fn test_json_array_nested_struct() -> Result<()> { + let schema = infer_json_array_schema( + r#"[{"id": 1, "info": {"name": "Alice", "age": 30}}]"#, + ) + .await?; + + let info_field = schema.field_with_name("info").unwrap(); + assert!(matches!(info_field.data_type(), DataType::Struct(_))); + Ok(()) + } + + #[tokio::test] + async fn test_json_array_list_type() -> Result<()> { + let schema = + infer_json_array_schema(r#"[{"id": 1, "tags": ["a", "b", "c"]}]"#).await?; + + let tags_field = schema.field_with_name("tags").unwrap(); + assert!(matches!(tags_field.data_type(), DataType::List(_))); + Ok(()) + } + + #[tokio::test] + async fn test_json_array_basic_query() -> Result<()> { + let result = query_json_array_str( + r#"[{"a": 1, "b": "hello"}, {"a": 2, "b": "world"}, {"a": 3, "b": "test"}]"#, + "SELECT a, b FROM test_table ORDER BY a", + ) + .await?; + + assert_snapshot!(result, @r" + +---+-------+ + | a | b | + +---+-------+ + | 1 | hello | + | 2 | world | + | 3 | test | + +---+-------+ + "); + Ok(()) + } + + #[tokio::test] + async fn test_json_array_with_nulls() -> Result<()> { + let result = query_json_array_str( + r#"[{"id": 1, "name": "Alice"}, {"id": 2, "name": null}, {"id": 3, "name": "Charlie"}]"#, + "SELECT id, name FROM test_table ORDER BY id", + ) + .await?; + + assert_snapshot!(result, @r" + +----+---------+ + | id | name | + +----+---------+ + | 1 | Alice | + | 2 | | + | 3 | Charlie | + +----+---------+ + "); + Ok(()) + } + + #[tokio::test] + async fn test_json_array_unnest() -> Result<()> { + let result = query_json_array_str( + r#"[{"id": 1, "values": [10, 20, 30]}, {"id": 2, "values": [40, 50]}]"#, + "SELECT id, unnest(values) as value FROM test_table ORDER BY id, value", + ) + .await?; + + assert_snapshot!(result, @r" + +----+-------+ + | id | value | + +----+-------+ + | 1 | 10 | + | 1 | 20 | + | 1 | 30 | + | 2 | 40 | + | 2 | 50 | + +----+-------+ + "); + Ok(()) + } + + #[tokio::test] + async fn test_json_array_unnest_struct() -> Result<()> { + let result = query_json_array_str( + r#"[{"id": 1, "orders": [{"product": "A", "qty": 2}, {"product": "B", "qty": 3}]}, {"id": 2, "orders": [{"product": "C", "qty": 1}]}]"#, + "SELECT id, unnest(orders)['product'] as product, unnest(orders)['qty'] as qty FROM test_table ORDER BY id, product", + ) + .await?; + + assert_snapshot!(result, @r" + +----+---------+-----+ + | id | product | qty | + +----+---------+-----+ + | 1 | A | 2 | + | 1 | B | 3 | + | 2 | C | 1 | + +----+---------+-----+ + "); + 
Ok(()) + } + + #[tokio::test] + async fn test_json_array_nested_struct_access() -> Result<()> { + let result = query_json_array_str( + r#"[{"id": 1, "dept": {"name": "Engineering", "head": "Alice"}}, {"id": 2, "dept": {"name": "Sales", "head": "Bob"}}]"#, + "SELECT id, dept['name'] as dept_name, dept['head'] as head FROM test_table ORDER BY id", + ) + .await?; + + assert_snapshot!(result, @r" + +----+-------------+-------+ + | id | dept_name | head | + +----+-------------+-------+ + | 1 | Engineering | Alice | + | 2 | Sales | Bob | + +----+-------------+-------+ + "); + Ok(()) + } + + #[tokio::test] + async fn test_json_array_with_compression() -> Result<()> { + use flate2::Compression; + use flate2::write::GzEncoder; + use std::io::Write; + + let tmp_dir = tempfile::TempDir::new()?; + let path = tmp_dir.path().join("array.json.gz"); + let path = path.to_string_lossy().to_string(); + + let file = std::fs::File::create(&path)?; + let mut encoder = GzEncoder::new(file, Compression::default()); + encoder.write_all( + r#"[{"a": 1, "b": "hello"}, {"a": 2, "b": "world"}]"#.as_bytes(), + )?; + encoder.finish()?; + + let ctx = SessionContext::new(); + let options = JsonReadOptions::default() + .newline_delimited(false) + .file_compression_type(FileCompressionType::GZIP) + .file_extension(".json.gz"); + + ctx.register_json("test_table", &path, options).await?; + let result = ctx + .sql("SELECT a, b FROM test_table ORDER BY a") + .await? + .collect() + .await?; + + assert_snapshot!(batches_to_string(&result), @r" + +---+-------+ + | a | b | + +---+-------+ + | 1 | hello | + | 2 | world | + +---+-------+ + "); + Ok(()) + } + + #[tokio::test] + async fn test_json_array_list_of_structs() -> Result<()> { + let batches = query_json_array( + r#"[{"id": 1, "items": [{"name": "x", "price": 10.5}]}, {"id": 2, "items": []}]"#, + "SELECT id, items FROM test_table ORDER BY id", + ) + .await?; + + assert_eq!(1, batches.len()); + assert_eq!(2, batches[0].num_rows()); + Ok(()) + } } diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs index 4881783eeba69..b04238ebc9b37 100644 --- a/datafusion/core/src/datasource/file_format/mod.rs +++ b/datafusion/core/src/datasource/file_format/mod.rs @@ -39,8 +39,9 @@ pub(crate) mod test_util { use arrow_schema::SchemaRef; use datafusion_catalog::Session; use datafusion_common::Result; + use datafusion_datasource::TableSchema; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; - use datafusion_datasource::{file_format::FileFormat, PartitionedFile}; + use datafusion_datasource::{PartitionedFile, file_format::FileFormat}; use datafusion_execution::object_store::ObjectStoreUrl; use std::sync::Arc; @@ -66,31 +67,24 @@ pub(crate) mod test_util { .await? }; + let table_schema = TableSchema::new(file_schema.clone(), vec![]); + let statistics = format .infer_stats(state, &store, file_schema.clone(), &meta) .await?; - let file_groups = vec![vec![PartitionedFile { - object_meta: meta, - partition_values: vec![], - range: None, - statistics: None, - extensions: None, - metadata_size_hint: None, - }] - .into()]; + let file_groups = vec![vec![PartitionedFile::new_from_meta(meta)].into()]; let exec = format .create_physical_plan( state, FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), - file_schema, - format.file_source(), + format.file_source(table_schema), ) .with_file_groups(file_groups) .with_statistics(statistics) - .with_projection_indices(projection) + .with_projection_indices(projection)? 
.with_limit(limit) .build(), ) .await?; @@ -131,7 +125,10 @@ mod tests { .write_parquet(out_dir_url, DataFrameWriteOptions::new(), None) .await .expect_err("should fail because input file does not match inferred schema"); - assert_eq!(e.strip_backtrace(), "Arrow error: Parser error: Error while parsing value 'd' as type 'Int64' for column 0 at line 4. Row data: '[d,4]'"); + assert_eq!( + e.strip_backtrace(), + "Arrow error: Parser error: Error while parsing value 'd' as type 'Int64' for column 0 at line 4. Row data: '[d,4]'" + ); Ok(()) } } diff --git a/datafusion/core/src/datasource/file_format/options.rs b/datafusion/core/src/datasource/file_format/options.rs index e78c5f09553cc..bd0ac36087381 100644 --- a/datafusion/core/src/datasource/file_format/options.rs +++ b/datafusion/core/src/datasource/file_format/options.rs @@ -25,9 +25,9 @@ use crate::datasource::file_format::avro::AvroFormat; #[cfg(feature = "parquet")] use crate::datasource::file_format::parquet::ParquetFormat; +use crate::datasource::file_format::DEFAULT_SCHEMA_INFER_MAX_RECORD; use crate::datasource::file_format::arrow::ArrowFormat; use crate::datasource::file_format::file_compression_type::FileCompressionType; -use crate::datasource::file_format::DEFAULT_SCHEMA_INFER_MAX_RECORD; use crate::datasource::listing::ListingTableUrl; use crate::datasource::{file_format::csv::CsvFormat, listing::ListingOptions}; use crate::error::Result; @@ -442,14 +442,23 @@ impl<'a> AvroReadOptions<'a> { } } -/// Options that control the reading of Line-delimited JSON files (NDJson) +#[deprecated( + since = "53.0.0", + note = "Use `JsonReadOptions` instead. This alias will be removed in a future version." +)] +#[doc = "Deprecated: Use [`JsonReadOptions`] instead."] +pub type NdJsonReadOptions<'a> = JsonReadOptions<'a>; + +/// Options that control the reading of JSON files. +/// +/// Supports both newline-delimited JSON (NDJSON) and JSON array formats. /// /// Note this structure is supplied when a datasource is created and -/// can not not vary from statement to statement. For settings that +/// can not vary from statement to statement. For settings that /// can vary statement to statement see /// [`ConfigOptions`](crate::config::ConfigOptions). #[derive(Clone)] -pub struct NdJsonReadOptions<'a> { +pub struct JsonReadOptions<'a> { /// The data source schema. pub schema: Option<&'a Schema>, /// Max number of rows to read from JSON files for schema inference if needed. Defaults to `DEFAULT_SCHEMA_INFER_MAX_RECORD`. @@ -465,9 +474,25 @@ pub struct NdJsonReadOptions<'a> { pub infinite: bool, /// Indicates how the file is sorted pub file_sort_order: Vec<Vec<SortExpr>>, /// Whether to read as newline-delimited JSON (default: true). 
+ /// + /// When `true` (default), expects newline-delimited JSON (NDJSON): + /// ```text + /// {"key1": 1, "key2": "val"} + /// {"key1": 2, "key2": "vals"} + /// ``` + /// + /// When `false`, expects JSON array format: + /// ```text + /// [ + /// {"key1": 1, "key2": "val"}, + /// {"key1": 2, "key2": "vals"} + /// ] + /// ``` + pub newline_delimited: bool, } -impl Default for NdJsonReadOptions<'_> { +impl Default for JsonReadOptions<'_> { fn default() -> Self { Self { schema: None, @@ -477,11 +502,12 @@ impl Default for NdJsonReadOptions<'_> { file_compression_type: FileCompressionType::UNCOMPRESSED, infinite: false, file_sort_order: vec![], + newline_delimited: true, } } } -impl<'a> NdJsonReadOptions<'a> { +impl<'a> JsonReadOptions<'a> { /// Specify table_partition_cols for partition pruning pub fn table_partition_cols( mut self, @@ -523,6 +549,32 @@ impl<'a> NdJsonReadOptions<'a> { self.file_sort_order = file_sort_order; self } + + /// Specify how many rows to read for schema inference + pub fn schema_infer_max_records(mut self, schema_infer_max_records: usize) -> Self { + self.schema_infer_max_records = schema_infer_max_records; + self + } + + /// Set whether to read as newline-delimited JSON. + /// + /// When `true` (default), expects newline-delimited JSON (NDJSON): + /// ```text + /// {"key1": 1, "key2": "val"} + /// {"key1": 2, "key2": "vals"} + /// ``` + /// + /// When `false`, expects JSON array format: + /// ```text + /// [ + /// {"key1": 1, "key2": "val"}, + /// {"key1": 2, "key2": "vals"} + /// ] + /// ``` + pub fn newline_delimited(mut self, newline_delimited: bool) -> Self { + self.newline_delimited = newline_delimited; + self + } } #[async_trait] @@ -648,7 +700,7 @@ impl ReadOptions<'_> for ParquetReadOptions<'_> { } #[async_trait] -impl ReadOptions<'_> for NdJsonReadOptions<'_> { +impl ReadOptions<'_> for JsonReadOptions<'_> { fn to_listing_options( &self, config: &SessionConfig, @@ -657,7 +709,8 @@ impl ReadOptions<'_> for NdJsonReadOptions<'_> { let file_format = JsonFormat::default() .with_options(table_options.json) .with_schema_infer_max_rec(self.schema_infer_max_records) - .with_file_compression_type(self.file_compression_type.to_owned()); + .with_file_compression_type(self.file_compression_type.to_owned()) + .with_newline_delimited(self.newline_delimited); ListingOptions::new(Arc::new(file_format)) .with_file_extension(self.file_extension)
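A minimal usage sketch of the `JsonReadOptions` API introduced above. The file paths and table names here are hypothetical, and the `datafusion::execution::options::JsonReadOptions` import path is assumed from the `crate::execution::options` re-export used elsewhere in this diff; `register_json` and the `newline_delimited`/`schema_infer_max_records` builders are the APIs shown in the hunks above:

```rust
use datafusion::error::Result;
use datafusion::execution::options::JsonReadOptions;
use datafusion::prelude::SessionContext;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();

    // Default behavior: newline-delimited JSON (NDJSON), one object per line.
    ctx.register_json("events_ndjson", "events.ndjson", JsonReadOptions::default())
        .await?;

    // JSON array format: the file is a single top-level `[ ... ]` of objects.
    // `newline_delimited(false)` switches the reader into this mode.
    let array_options = JsonReadOptions::default()
        .newline_delimited(false)
        .schema_infer_max_records(100);
    ctx.register_json("events_array", "events.json", array_options)
        .await?;

    ctx.sql("SELECT * FROM events_array").await?.show().await?;
    Ok(())
}
```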
diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 52c5393e10319..6a8f7ab999757 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -107,8 +107,8 @@ pub(crate) mod test_util { mod tests { use std::fmt::{self, Display, Formatter}; - use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; + use std::sync::atomic::{AtomicUsize, Ordering}; use std::time::Duration; use crate::datasource::file_format::parquet::test_util::store_parquet; @@ -120,6 +120,7 @@ mod tests { use arrow::array::RecordBatch; use arrow_schema::Schema; use datafusion_catalog::Session; + use datafusion_common::ScalarValue::Utf8; use datafusion_common::cast::{ as_binary_array, as_binary_view_array, as_boolean_array, as_float32_array, as_float64_array, as_int32_array, as_timestamp_nanosecond_array, @@ -127,43 +128,45 @@ mod tests { use datafusion_common::config::{ParquetOptions, TableParquetOptions}; use datafusion_common::stats::Precision; use datafusion_common::test_util::batches_to_string; - use datafusion_common::ScalarValue::Utf8; use datafusion_common::{Result, ScalarValue}; use datafusion_datasource::file_format::FileFormat; - use datafusion_datasource::file_sink_config::{FileSink, FileSinkConfig}; + use datafusion_datasource::file_sink_config::{ + FileOutputMode, FileSink, FileSinkConfig, + }; use datafusion_datasource::{ListingTableUrl, PartitionedFile}; use datafusion_datasource_parquet::{ ParquetFormat, ParquetFormatFactory, ParquetSink, }; + use datafusion_execution::TaskContext; use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_execution::runtime_env::RuntimeEnv; - use datafusion_execution::TaskContext; use datafusion_expr::dml::InsertOp; use datafusion_physical_plan::stream::RecordBatchStreamAdapter; - use datafusion_physical_plan::{collect, ExecutionPlan}; + use datafusion_physical_plan::{ExecutionPlan, collect}; use crate::test_util::bounded_stream; use arrow::array::{ - types::Int32Type, Array, ArrayRef, DictionaryArray, Int32Array, Int64Array, - StringArray, + Array, ArrayRef, DictionaryArray, Int32Array, Int64Array, StringArray, + types::Int32Type, }; use arrow::datatypes::{DataType, Field}; use async_trait::async_trait; use datafusion_datasource::file_groups::FileGroup; use datafusion_datasource_parquet::metadata::DFParquetMetadata; - use futures::stream::BoxStream; use futures::StreamExt; + use futures::stream::BoxStream; use insta::assert_snapshot; use object_store::local::LocalFileSystem; - use object_store::ObjectMeta; + use object_store::{CopyOptions, ObjectMeta}; use object_store::{ - path::Path, GetOptions, GetResult, ListResult, MultipartUpload, ObjectStore, - PutMultipartOptions, PutOptions, PutPayload, PutResult, + GetOptions, GetResult, ListResult, MultipartUpload, ObjectStore, + PutMultipartOptions, PutOptions, PutPayload, PutResult, path::Path, }; - use parquet::arrow::arrow_reader::ArrowReaderOptions; use parquet::arrow::ParquetRecordBatchStreamBuilder; + use parquet::arrow::arrow_reader::ArrowReaderOptions; use parquet::file::metadata::{ - KeyValue, ParquetColumnIndex, ParquetMetaData, ParquetOffsetIndex, + KeyValue, PageIndexPolicy, ParquetColumnIndex, ParquetMetaData, + ParquetOffsetIndex, }; use parquet::file::page_index::column_index::ColumnIndexMetaData; use tokio::fs::File; @@ -308,7 +311,7 @@ mod tests { _payload: PutPayload, _opts: PutOptions, ) -> object_store::Result<PutResult> { - Err(object_store::Error::NotImplemented) + unimplemented!() } async fn put_multipart_opts( @@ -316,7 +319,7 @@ mod tests { _location: &Path, _opts: PutMultipartOptions, ) -> object_store::Result<Box<dyn MultipartUpload>> { - Err(object_store::Error::NotImplemented) + unimplemented!() } async fn get_opts( @@ -328,40 +331,34 @@ mod tests { self.inner.get_opts(location, options).await } - async fn head(&self, _location: &Path) -> object_store::Result<ObjectMeta> { - Err(object_store::Error::NotImplemented) - } - - async fn delete(&self, _location: &Path) -> object_store::Result<()> { - Err(object_store::Error::NotImplemented) + fn delete_stream( + &self, + _locations: BoxStream<'static, object_store::Result<Path>>, + ) -> BoxStream<'static, object_store::Result<Path>> { + unimplemented!() } fn list( &self, _prefix: Option<&Path>, ) -> BoxStream<'static, object_store::Result<ObjectMeta>> { - Box::pin(futures::stream::once(async { - Err(object_store::Error::NotImplemented) - })) + unimplemented!() } async fn list_with_delimiter( &self, _prefix: Option<&Path>, ) -> object_store::Result<ListResult> { - Err(object_store::Error::NotImplemented) - } - - async fn copy(&self, _from: &Path, _to: &Path) -> object_store::Result<()> { - 
Err(object_store::Error::NotImplemented) + unimplemented!() } - async fn copy_if_not_exists( + async fn copy_opts( &self, _from: &Path, _to: &Path, + _options: CopyOptions, ) -> object_store::Result<()> { - Err(object_store::Error::NotImplemented) + unimplemented!() } } @@ -724,7 +721,7 @@ mod tests { // TODO correct byte size: https://github.com/apache/datafusion/issues/14936 assert_eq!( exec.partition_statistics(None)?.total_byte_size, - Precision::Exact(671) + Precision::Absent, ); Ok(()) @@ -770,10 +767,9 @@ mod tests { exec.partition_statistics(None)?.num_rows, Precision::Exact(8) ); - // TODO correct byte size: https://github.com/apache/datafusion/issues/14936 assert_eq!( exec.partition_statistics(None)?.total_byte_size, - Precision::Exact(671) + Precision::Absent, ); let batches = collect(exec, task_ctx).await?; assert_eq!(1, batches.len()); @@ -816,7 +812,7 @@ mod tests { .schema() .fields() .iter() - .map(|f| format!("{}: {:?}", f.name(), f.data_type())) + .map(|f| format!("{}: {}", f.name(), f.data_type())) .collect(); let y = x.join("\n"); assert_eq!(expected, y); @@ -842,7 +838,7 @@ mod tests { double_col: Float64\n\ date_string_col: Binary\n\ string_col: Binary\n\ - timestamp_col: Timestamp(Nanosecond, None)"; + timestamp_col: Timestamp(ns)"; _run_read_alltypes_plain_parquet(ForceViews::No, no_views).await?; let with_views = "id: Int32\n\ @@ -855,7 +851,7 @@ mod tests { double_col: Float64\n\ date_string_col: BinaryView\n\ string_col: BinaryView\n\ - timestamp_col: Timestamp(Nanosecond, None)"; + timestamp_col: Timestamp(ns)"; _run_read_alltypes_plain_parquet(ForceViews::Yes, with_views).await?; Ok(()) @@ -931,7 +927,10 @@ mod tests { values.push(array.value(i)); } - assert_eq!("[1235865600000000000, 1235865660000000000, 1238544000000000000, 1238544060000000000, 1233446400000000000, 1233446460000000000, 1230768000000000000, 1230768060000000000]", format!("{values:?}")); + assert_eq!( + "[1235865600000000000, 1235865660000000000, 1238544000000000000, 1238544060000000000, 1233446400000000000, 1233446460000000000, 1230768000000000000, 1230768060000000000]", + format!("{values:?}") + ); Ok(()) } @@ -1101,7 +1100,8 @@ mod tests { let testdata = datafusion_common::test_util::parquet_test_data(); let path = format!("{testdata}/alltypes_tiny_pages.parquet"); let file = File::open(path).await?; - let options = ArrowReaderOptions::new().with_page_index(true); + let options = + ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required); let builder = ParquetRecordBatchStreamBuilder::new_with_options(file, options.clone()) .await? 
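A minimal sketch of the policy-based page-index API used in the hunk above, relying on the `parquet` crate items imported in this diff (`ArrowReaderOptions`, `PageIndexPolicy`, `ParquetRecordBatchStreamBuilder`). The file path is hypothetical, and the behavioral note is an assumption: `Required` is expected to surface an error when a file lacks page indexes, unlike the old boolean `with_page_index(true)` toggle:

```rust
use parquet::arrow::ParquetRecordBatchStreamBuilder;
use parquet::arrow::arrow_reader::ArrowReaderOptions;
use parquet::file::metadata::PageIndexPolicy;
use tokio::fs::File;

async fn read_with_required_page_index() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical path; the test above uses parquet_test_data().
    let file = File::open("alltypes_tiny_pages.parquet").await?;
    // Require the column/offset indexes to be loaded along with the footer.
    let options =
        ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required);
    let builder =
        ParquetRecordBatchStreamBuilder::new_with_options(file, options).await?;
    // The indexes are now available through the parquet metadata.
    assert!(builder.metadata().column_index().is_some());
    Ok(())
}
```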
@@ -1204,10 +1204,10 @@ mod tests { let result = df.collect().await?; - assert_snapshot!(batches_to_string(&result), @r###" - ++ - ++ - "###); + assert_snapshot!(batches_to_string(&result), @r" + ++ + ++ + "); Ok(()) } @@ -1233,10 +1233,10 @@ mod tests { let result = df.collect().await?; - assert_snapshot!(batches_to_string(&result), @r###" - ++ - ++ - "###); + assert_snapshot!(batches_to_string(&result), @r" + ++ + ++ + "); Ok(()) } @@ -1364,6 +1364,28 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_write_empty_parquet_from_sql() -> Result<()> { + let ctx = SessionContext::new(); + + let tmp_dir = tempfile::TempDir::new()?; + let path = format!("{}/empty_sql.parquet", tmp_dir.path().to_string_lossy()); + let df = ctx.sql("SELECT CAST(1 AS INT) AS id LIMIT 0").await?; + df.write_parquet(&path, crate::dataframe::DataFrameWriteOptions::new(), None) + .await?; + // Expected the file to exist + assert!(std::path::Path::new(&path).exists()); + let read_df = ctx.read_parquet(&path, ParquetReadOptions::new()).await?; + let stream = read_df.execute_stream().await?; + assert_eq!(stream.schema().fields().len(), 1); + assert_eq!(stream.schema().field(0).name(), "id"); + + let results: Vec<_> = stream.collect().await; + assert_eq!(results.len(), 0); + + Ok(()) + } + #[tokio::test] async fn parquet_sink_write_insert_schema_into_metadata() -> Result<()> { // expected kv metadata without schema @@ -1523,6 +1545,7 @@ mod tests { insert_op: InsertOp::Overwrite, keep_partition_by_columns: false, file_extension: "parquet".into(), + file_output_mode: FileOutputMode::Automatic, }; let parquet_sink = Arc::new(ParquetSink::new( file_sink_config, @@ -1614,6 +1637,7 @@ mod tests { insert_op: InsertOp::Overwrite, keep_partition_by_columns: false, file_extension: "parquet".into(), + file_output_mode: FileOutputMode::Automatic, }; let parquet_sink = Arc::new(ParquetSink::new( file_sink_config, @@ -1704,6 +1728,7 @@ mod tests { insert_op: InsertOp::Overwrite, keep_partition_by_columns: false, file_extension: "parquet".into(), + file_output_mode: FileOutputMode::Automatic, }; let parquet_sink = Arc::new(ParquetSink::new( file_sink_config, diff --git a/datafusion/core/src/datasource/listing/mod.rs b/datafusion/core/src/datasource/listing/mod.rs index c206566a65941..85dee3f91cffb 100644 --- a/datafusion/core/src/datasource/listing/mod.rs +++ b/datafusion/core/src/datasource/listing/mod.rs @@ -21,7 +21,8 @@ mod table; pub use datafusion_catalog_listing::helpers; pub use datafusion_catalog_listing::{ListingOptions, ListingTable, ListingTableConfig}; -pub use datafusion_datasource::{ - FileRange, ListingTableUrl, PartitionedFile, PartitionedFileStream, -}; +// Keep for backwards compatibility until removed +#[expect(deprecated)] +pub use datafusion_datasource::PartitionedFileStream; +pub use datafusion_datasource::{FileRange, ListingTableUrl, PartitionedFile}; pub use table::ListingTableConfigExt; diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 3333b70676203..d14ec1f56dce2 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -107,14 +107,16 @@ impl ListingTableConfigExt for ListingTableConfig { #[cfg(test)] mod tests { + #[cfg(feature = "parquet")] use crate::datasource::file_format::parquet::ParquetFormat; use crate::datasource::listing::table::ListingTableConfigExt; + use crate::execution::options::JsonReadOptions; use crate::prelude::*; use crate::{ datasource::{ - 
file_format::csv::CsvFormat, file_format::json::JsonFormat, - provider_as_source, DefaultTableSource, MemTable, + DefaultTableSource, MemTable, file_format::csv::CsvFormat, + file_format::json::JsonFormat, provider_as_source, }, execution::options::ArrowReadOptions, test::{ @@ -129,33 +131,26 @@ mod tests { ListingOptions, ListingTable, ListingTableConfig, SchemaSource, }; use datafusion_common::{ - assert_contains, plan_err, + DataFusionError, Result, ScalarValue, assert_contains, stats::Precision, test_util::{batches_to_string, datafusion_test_data}, - ColumnStatistics, DataFusionError, Result, ScalarValue, }; + use datafusion_datasource::ListingTableUrl; use datafusion_datasource::file_compression_type::FileCompressionType; use datafusion_datasource::file_format::FileFormat; - use datafusion_datasource::schema_adapter::{ - SchemaAdapter, SchemaAdapterFactory, SchemaMapper, - }; - use datafusion_datasource::ListingTableUrl; use datafusion_expr::dml::InsertOp; use datafusion_expr::{BinaryExpr, LogicalPlanBuilder, Operator}; - use datafusion_physical_expr::expressions::binary; use datafusion_physical_expr::PhysicalSortExpr; + use datafusion_physical_expr::expressions::binary; use datafusion_physical_expr_common::sort_expr::LexOrdering; use datafusion_physical_plan::empty::EmptyExec; - use datafusion_physical_plan::{collect, ExecutionPlanProperties}; - use rstest::rstest; + use datafusion_physical_plan::{ExecutionPlanProperties, collect}; use std::collections::HashMap; use std::io::Write; use std::sync::Arc; use tempfile::TempDir; use url::Url; - const DUMMY_NULL_COUNT: Precision = Precision::Exact(42); - /// Creates a test schema with standard field types used in tests fn create_test_schema() -> SchemaRef { Arc::new(Schema::new(vec![ @@ -257,7 +252,7 @@ mod tests { ); assert_eq!( exec.partition_statistics(None)?.total_byte_size, - Precision::Exact(671) + Precision::Absent, ); Ok(()) @@ -289,32 +284,36 @@ mod tests { // sort expr, but non column ( vec![vec![col("int_col").add(lit(1)).sort(true, true)]], - Ok(vec![[PhysicalSortExpr { - expr: binary( - physical_col("int_col", &schema).unwrap(), - Operator::Plus, - physical_lit(1), - &schema, - ) - .unwrap(), - options: SortOptions { - descending: false, - nulls_first: true, - }, - }] - .into()]), + Ok(vec![ + [PhysicalSortExpr { + expr: binary( + physical_col("int_col", &schema).unwrap(), + Operator::Plus, + physical_lit(1), + &schema, + ) + .unwrap(), + options: SortOptions { + descending: false, + nulls_first: true, + }, + }] + .into(), + ]), ), // ok with one column ( vec![vec![col("string_col").sort(true, false)]], - Ok(vec![[PhysicalSortExpr { - expr: physical_col("string_col", &schema).unwrap(), - options: SortOptions { - descending: false, - nulls_first: false, - }, - }] - .into()]), + Ok(vec![ + [PhysicalSortExpr { + expr: physical_col("string_col", &schema).unwrap(), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }] + .into(), + ]), ), // ok with two columns, different options ( @@ -322,19 +321,21 @@ mod tests { col("string_col").sort(true, false), col("int_col").sort(false, true), ]], - Ok(vec![[ - PhysicalSortExpr::new_default( - physical_col("string_col", &schema).unwrap(), - ) - .asc() - .nulls_last(), - PhysicalSortExpr::new_default( - physical_col("int_col", &schema).unwrap(), - ) - .desc() - .nulls_first(), - ] - .into()]), + Ok(vec![ + [ + PhysicalSortExpr::new_default( + physical_col("string_col", &schema).unwrap(), + ) + .asc() + .nulls_last(), + PhysicalSortExpr::new_default( + 
physical_col("int_col", &schema).unwrap(), + ) + .desc() + .nulls_first(), + ] + .into(), + ]), ), ]; @@ -348,7 +349,7 @@ mod tests { let table = ListingTable::try_new(config.clone()).expect("Creating the table"); let ordering_result = - table.try_create_output_ordering(state.execution_props()); + table.try_create_output_ordering(state.execution_props(), &[]); match (expected_result, ordering_result) { (Ok(expected), Ok(result)) => { @@ -404,7 +405,7 @@ mod tests { .await .expect("Empty execution plan"); - assert!(scan.as_any().is::()); + assert!(scan.is::()); assert_eq!( columns(&scan.schema()), vec!["a".to_owned(), "p1".to_owned()] @@ -453,9 +454,9 @@ mod tests { let table = ListingTable::try_new(config)?; - let (file_list, _) = table.list_files_for_scan(&ctx.state(), &[], None).await?; + let result = table.list_files_for_scan(&ctx.state(), &[], None).await?; - assert_eq!(file_list.len(), output_partitioning); + assert_eq!(result.file_groups.len(), output_partitioning); Ok(()) } @@ -488,9 +489,9 @@ mod tests { let table = ListingTable::try_new(config)?; - let (file_list, _) = table.list_files_for_scan(&ctx.state(), &[], None).await?; + let result = table.list_files_for_scan(&ctx.state(), &[], None).await?; - assert_eq!(file_list.len(), output_partitioning); + assert_eq!(result.file_groups.len(), output_partitioning); Ok(()) } @@ -538,9 +539,9 @@ mod tests { let table = ListingTable::try_new(config)?; - let (file_list, _) = table.list_files_for_scan(&ctx.state(), &[], None).await?; + let result = table.list_files_for_scan(&ctx.state(), &[], None).await?; - assert_eq!(file_list.len(), output_partitioning); + assert_eq!(result.file_groups.len(), output_partitioning); Ok(()) } @@ -731,8 +732,8 @@ mod tests { } #[tokio::test] - async fn test_insert_into_append_new_parquet_files_invalid_session_fails( - ) -> Result<()> { + async fn test_insert_into_append_new_parquet_files_invalid_session_fails() + -> Result<()> { let mut config_map: HashMap = HashMap::new(); config_map.insert( "datafusion.execution.parquet.compression".into(), @@ -746,7 +747,10 @@ mod tests { ) .await .expect_err("Example should fail!"); - assert_eq!(e.strip_backtrace(), "Invalid or Unsupported Configuration: zstd compression requires specifying a level such as zstd(4)"); + assert_eq!( + e.strip_backtrace(), + "Invalid or Unsupported Configuration: zstd compression requires specifying a level such as zstd(4)" + ); Ok(()) } @@ -806,7 +810,7 @@ mod tests { .register_json( "t", tmp_dir.path().to_str().unwrap(), - NdJsonReadOptions::default() + JsonReadOptions::default() .schema(schema.as_ref()) .file_compression_type(file_compression_type), ) @@ -873,13 +877,13 @@ mod tests { let res = collect(plan, session_ctx.task_ctx()).await?; // Insert returns the number of rows written, in our case this would be 6. - insta::allow_duplicates! {insta::assert_snapshot!(batches_to_string(&res),@r###" - +-------+ - | count | - +-------+ - | 20 | - +-------+ - "###);} + insta::allow_duplicates! {insta::assert_snapshot!(batches_to_string(&res),@r" + +-------+ + | count | + +-------+ + | 20 | + +-------+ + ");} // Read the records in the table let batches = session_ctx @@ -888,13 +892,13 @@ mod tests { .collect() .await?; - insta::allow_duplicates! {insta::assert_snapshot!(batches_to_string(&batches),@r###" - +-------+ - | count | - +-------+ - | 20 | - +-------+ - "###);} + insta::allow_duplicates! 
{insta::assert_snapshot!(batches_to_string(&batches),@r" + +-------+ + | count | + +-------+ + | 20 | + +-------+ + ");} // Assert that `target_partition_number` many files were added to the table. let num_files = tmp_dir.path().read_dir()?.count(); @@ -909,13 +913,13 @@ mod tests { // Again, execute the physical plan and collect the results let res = collect(plan, session_ctx.task_ctx()).await?; - insta::allow_duplicates! {insta::assert_snapshot!(batches_to_string(&res),@r###" - +-------+ - | count | - +-------+ - | 20 | - +-------+ - "###);} + insta::allow_duplicates! {insta::assert_snapshot!(batches_to_string(&res),@r" + +-------+ + | count | + +-------+ + | 20 | + +-------+ + ");} // Read the contents of the table let batches = session_ctx @@ -924,13 +928,13 @@ mod tests { .collect() .await?; - insta::allow_duplicates! {insta::assert_snapshot!(batches_to_string(&batches),@r###" - +-------+ - | count | - +-------+ - | 40 | - +-------+ - "###);} + insta::allow_duplicates! {insta::assert_snapshot!(batches_to_string(&batches),@r" + +-------+ + | count | + +-------+ + | 40 | + +-------+ + ");} // Assert that another `target_partition_number` many files were added to the table. let num_files = tmp_dir.path().read_dir()?.count(); @@ -988,15 +992,15 @@ mod tests { .collect() .await?; - insta::allow_duplicates! {insta::assert_snapshot!(batches_to_string(&batches),@r###" - +-----+-----+---+ - | a | b | c | - +-----+-----+---+ - | foo | bar | 1 | - | foo | bar | 2 | - | foo | bar | 3 | - +-----+-----+---+ - "###);} + insta::allow_duplicates! {insta::assert_snapshot!(batches_to_string(&batches),@r" + +-----+-----+---+ + | a | b | c | + +-----+-----+---+ + | foo | bar | 1 | + | foo | bar | 2 | + | foo | bar | 3 | + +-----+-----+---+ + ");} Ok(()) } @@ -1307,10 +1311,10 @@ mod tests { let table = ListingTable::try_new(config)?; - let (file_list, _) = table.list_files_for_scan(&ctx.state(), &[], None).await?; - assert_eq!(file_list.len(), 1); + let result = table.list_files_for_scan(&ctx.state(), &[], None).await?; + assert_eq!(result.file_groups.len(), 1); - let files = file_list[0].clone(); + let files = result.file_groups[0].clone(); assert_eq!( files @@ -1397,7 +1401,7 @@ mod tests { // TODO correct byte size: https://github.com/apache/datafusion/issues/14936 assert_eq!( exec_enabled.partition_statistics(None)?.total_byte_size, - Precision::Exact(671) + Precision::Absent, ); Ok(()) @@ -1416,7 +1420,9 @@ mod tests { ]; for (format, batch_size, soft_max_rows, expected_files) in test_cases { - println!("Testing insert with format: {format}, batch_size: {batch_size}, expected files: {expected_files}"); + println!( + "Testing insert with format: {format}, batch_size: {batch_size}, expected files: {expected_files}" + ); let mut config_map = HashMap::new(); config_map.insert( @@ -1449,33 +1455,10 @@ mod tests { } #[tokio::test] - async fn test_statistics_mapping_with_custom_factory() -> Result<()> { - let ctx = SessionContext::new(); - let table = create_test_listing_table_with_json_and_adapter( - &ctx, - false, - // NullStatsAdapterFactory sets column_statistics null_count to DUMMY_NULL_COUNT - Arc::new(NullStatsAdapterFactory {}), - )?; - - let (groups, stats) = table.list_files_for_scan(&ctx.state(), &[], None).await?; - - assert_eq!(stats.column_statistics[0].null_count, DUMMY_NULL_COUNT); - for g in groups { - if let Some(s) = g.file_statistics(None) { - assert_eq!(s.column_statistics[0].null_count, DUMMY_NULL_COUNT); - } - } - - Ok(()) - } - - #[tokio::test] - async fn 
test_statistics_mapping_with_default_factory() -> Result<()> { + async fn test_basic_table_scan() -> Result<()> { let ctx = SessionContext::new(); - // Create a table without providing a custom schema adapter factory - // This should fall back to using DefaultSchemaAdapterFactory + // Test basic table creation and scanning let path = "table/file.json"; register_test_store(&ctx, &[(path, 10)]); @@ -1487,222 +1470,20 @@ mod tests { let config = ListingTableConfig::new(table_path) .with_listing_options(opt) .with_schema(Arc::new(schema)); - // Note: NOT calling .with_schema_adapter_factory() to test default behavior let table = ListingTable::try_new(config)?; - // Verify that no custom schema adapter factory is set - assert!(table.schema_adapter_factory().is_none()); - - // The scan should work correctly with the default schema adapter + // The scan should work correctly let scan_result = table.scan(&ctx.state(), None, &[], None).await; - assert!( - scan_result.is_ok(), - "Scan should succeed with default schema adapter" - ); - - // Verify that the default adapter handles basic schema compatibility - let (groups, _stats) = table.list_files_for_scan(&ctx.state(), &[], None).await?; - assert!( - !groups.is_empty(), - "Should list files successfully with default adapter" - ); - - Ok(()) - } - - #[rstest] - #[case(MapSchemaError::TypeIncompatible, "Cannot map incompatible types")] - #[case(MapSchemaError::GeneralFailure, "Schema adapter mapping failed")] - #[case( - MapSchemaError::InvalidProjection, - "Invalid projection in schema mapping" - )] - #[tokio::test] - async fn test_schema_adapter_map_schema_errors( - #[case] error_type: MapSchemaError, - #[case] expected_error_msg: &str, - ) -> Result<()> { - let ctx = SessionContext::new(); - let table = create_test_listing_table_with_json_and_adapter( - &ctx, - false, - Arc::new(FailingMapSchemaAdapterFactory { error_type }), - )?; - - // The error should bubble up from the scan operation when schema mapping fails - let scan_result = table.scan(&ctx.state(), None, &[], None).await; - - assert!(scan_result.is_err()); - let error_msg = scan_result.unwrap_err().to_string(); - assert!( - error_msg.contains(expected_error_msg), - "Expected error containing '{expected_error_msg}', got: {error_msg}" - ); - - Ok(()) - } - - // Test that errors during file listing also bubble up correctly - #[tokio::test] - async fn test_schema_adapter_error_during_file_listing() -> Result<()> { - let ctx = SessionContext::new(); - let table = create_test_listing_table_with_json_and_adapter( - &ctx, - true, - Arc::new(FailingMapSchemaAdapterFactory { - error_type: MapSchemaError::TypeIncompatible, - }), - )?; + assert!(scan_result.is_ok(), "Scan should succeed"); - // The error should bubble up from list_files_for_scan when collecting statistics - let list_result = table.list_files_for_scan(&ctx.state(), &[], None).await; - - assert!(list_result.is_err()); - let error_msg = list_result.unwrap_err().to_string(); + // Verify file listing works + let result = table.list_files_for_scan(&ctx.state(), &[], None).await?; assert!( - error_msg.contains("Cannot map incompatible types"), - "Expected type incompatibility error during file listing, got: {error_msg}" + !result.file_groups.is_empty(), + "Should list files successfully" ); Ok(()) } - - #[derive(Debug, Copy, Clone)] - enum MapSchemaError { - TypeIncompatible, - GeneralFailure, - InvalidProjection, - } - - #[derive(Debug)] - struct FailingMapSchemaAdapterFactory { - error_type: MapSchemaError, - } - - impl 
SchemaAdapterFactory for FailingMapSchemaAdapterFactory { - fn create( - &self, - projected_table_schema: SchemaRef, - _table_schema: SchemaRef, - ) -> Box<dyn SchemaAdapter> { - Box::new(FailingMapSchemaAdapter { - schema: projected_table_schema, - error_type: self.error_type, - }) - } - } - - #[derive(Debug)] - struct FailingMapSchemaAdapter { - schema: SchemaRef, - error_type: MapSchemaError, - } - - impl SchemaAdapter for FailingMapSchemaAdapter { - fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option<usize> { - let field = self.schema.field(index); - file_schema.fields.find(field.name()).map(|(i, _)| i) - } - - fn map_schema( - &self, - _file_schema: &Schema, - ) -> Result<(Arc<dyn SchemaMapper>, Vec<usize>)> { - // Always fail with different error types based on the configured error_type - match self.error_type { - MapSchemaError::TypeIncompatible => { - plan_err!( - "Cannot map incompatible types: Boolean cannot be cast to Utf8" - ) - } - MapSchemaError::GeneralFailure => { - plan_err!("Schema adapter mapping failed due to internal error") - } - MapSchemaError::InvalidProjection => { - plan_err!("Invalid projection in schema mapping: column index out of bounds") - } - } - } - } - - #[derive(Debug)] - struct NullStatsAdapterFactory; - - impl SchemaAdapterFactory for NullStatsAdapterFactory { - fn create( - &self, - projected_table_schema: SchemaRef, - _table_schema: SchemaRef, - ) -> Box<dyn SchemaAdapter> { - Box::new(NullStatsAdapter { - schema: projected_table_schema, - }) - } - } - - #[derive(Debug)] - struct NullStatsAdapter { - schema: SchemaRef, - } - - impl SchemaAdapter for NullStatsAdapter { - fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option<usize> { - let field = self.schema.field(index); - file_schema.fields.find(field.name()).map(|(i, _)| i) - } - - fn map_schema( - &self, - file_schema: &Schema, - ) -> Result<(Arc<dyn SchemaMapper>, Vec<usize>)> { - let projection = (0..file_schema.fields().len()).collect(); - Ok((Arc::new(NullStatsMapper {}), projection)) - } - } - - #[derive(Debug)] - struct NullStatsMapper; - - impl SchemaMapper for NullStatsMapper { - fn map_batch(&self, batch: RecordBatch) -> Result<RecordBatch> { - Ok(batch) - } - - fn map_column_statistics( - &self, - stats: &[ColumnStatistics], - ) -> Result<Vec<ColumnStatistics>> { - Ok(stats - .iter() - .map(|s| { - let mut s = s.clone(); - s.null_count = DUMMY_NULL_COUNT; - s - }) - .collect()) - } - } - - /// Helper function to create a test ListingTable with JSON format and custom schema adapter factory - fn create_test_listing_table_with_json_and_adapter( - ctx: &SessionContext, - collect_stat: bool, - schema_adapter_factory: Arc<dyn SchemaAdapterFactory>, - ) -> Result<ListingTable> { - let path = "table/file.json"; - register_test_store(ctx, &[(path, 10)]); - - let format = JsonFormat::default(); - let opt = ListingOptions::new(Arc::new(format)).with_collect_stat(collect_stat); - let schema = Schema::new(vec![Field::new("a", DataType::Boolean, false)]); - let table_path = ListingTableUrl::parse("test:///table/")?; - - let config = ListingTableConfig::new(table_path) - .with_listing_options(opt) - .with_schema(Arc::new(schema)) - .with_schema_adapter_factory(schema_adapter_factory); - - ListingTable::try_new(config) - } } diff --git a/datafusion/core/src/datasource/listing_table_factory.rs b/datafusion/core/src/datasource/listing_table_factory.rs index f98297d0e3f7f..a5139346752a9 100644 --- a/datafusion/core/src/datasource/listing_table_factory.rs +++ b/datafusion/core/src/datasource/listing_table_factory.rs @@ -28,8 +28,8 @@ use crate::datasource::listing::{ use crate::execution::context::SessionState; use arrow::datatypes::DataType; 
-use datafusion_common::{arrow_datafusion_err, plan_err, DataFusionError, ToDFSchema}; -use datafusion_common::{config_datafusion_err, Result}; +use datafusion_common::{Result, config_datafusion_err}; +use datafusion_common::{ToDFSchema, arrow_datafusion_err, plan_err}; use datafusion_expr::CreateExternalTable; use async_trait::async_trait; @@ -54,7 +54,15 @@ impl TableProviderFactory for ListingTableFactory { cmd: &CreateExternalTable, ) -> Result<Arc<dyn TableProvider>> { // TODO (https://github.com/apache/datafusion/issues/11600) remove downcast_ref from here. Should file format factory be an extension to session state? - let session_state = state.as_any().downcast_ref::<SessionState>().unwrap(); + let session_state = + state + .as_any() + .downcast_ref::<SessionState>() + .ok_or_else(|| { + datafusion_common::internal_datafusion_err!( + "ListingTableFactory requires SessionState" + ) + })?; let file_format = session_state .get_file_format_factory(cmd.file_type.as_str()) .ok_or(config_datafusion_err!( ... ))? .create(session_state, &cmd.options)?; - let mut table_path = ListingTableUrl::parse(&cmd.location)?; + let mut table_path = + ListingTableUrl::parse(&cmd.location)?.with_table_ref(cmd.name.clone()); let file_extension = match table_path.is_collection() { // Setting the extension to be empty instead of allowing the default extension seems // odd, but was done to ensure existing behavior isn't modified. It seems like this @@ -190,6 +199,16 @@ .with_definition(cmd.definition.clone()) .with_constraints(cmd.constraints.clone()) .with_column_defaults(cmd.column_defaults.clone()); + + // Pre-warm statistics cache if collect_statistics is enabled + if session_state.config().collect_statistics() { + let filters = &[]; + let limit = None; + if let Err(e) = table.list_files_for_scan(state, filters, limit).await { + log::warn!("Failed to pre-warm statistics cache: {e}"); + } + } + Ok(Arc::new(table)) } } @@ -205,19 +224,24 @@ fn get_extension(path: &str) -> String { #[cfg(test)] mod tests { + use super::*; + use crate::{ + datasource::file_format::csv::CsvFormat, execution::context::SessionContext, + test_util::parquet_test_data, + }; + use datafusion_execution::cache::CacheAccessor; + use datafusion_execution::cache::cache_manager::CacheManagerConfig; + use datafusion_execution::cache::cache_unit::DefaultFileStatisticsCache; use datafusion_execution::config::SessionConfig; + use datafusion_execution::runtime_env::RuntimeEnvBuilder; use glob::Pattern; use std::collections::HashMap; use std::fs; use std::path::PathBuf; - use super::*; - use crate::{ - datasource::file_format::csv::CsvFormat, execution::context::SessionContext, - }; - use datafusion_common::parsers::CompressionTypeVariant; - use datafusion_common::{Constraints, DFSchema, TableReference}; + use datafusion_common::{DFSchema, TableReference}; + use datafusion_expr::registry::ExtensionTypeRegistryRef; #[tokio::test] async fn test_create_using_non_std_file_ext() { ... let context = SessionContext::new(); let state = context.state(); let name = TableReference::bare("foo"); - let cmd = CreateExternalTable { + let cmd = CreateExternalTable::builder( name, - location: csv_file.path().to_str().unwrap().to_string(), - file_type: "csv".to_string(), - schema: Arc::new(DFSchema::empty()), - table_partition_cols: vec![], - if_not_exists: false, - or_replace: false, - temporary: false, - definition: None, - order_exprs: vec![], - unbounded: false, 
- options: HashMap::from([("format.has_header".into(), "true".into())]), - constraints: Constraints::default(), - column_defaults: HashMap::new(), - }; + csv_file.path().to_str().unwrap().to_string(), + "csv", + Arc::new(DFSchema::empty()), + ) + .with_options(HashMap::from([("format.has_header".into(), "true".into())])) + .build(); let table_provider = factory.create(&state, &cmd).await.unwrap(); - let listing_table = table_provider - .as_any() - .downcast_ref::<ListingTable>() - .unwrap(); + let listing_table = table_provider.downcast_ref::<ListingTable>().unwrap(); let listing_options = listing_table.options(); assert_eq!(".tbl", listing_options.file_extension); } @@ -272,30 +285,19 @@ mod tests { let mut options = HashMap::new(); options.insert("format.schema_infer_max_rec".to_owned(), "1000".to_owned()); options.insert("format.has_header".into(), "true".into()); - let cmd = CreateExternalTable { + let cmd = CreateExternalTable::builder( name, - location: csv_file.path().to_str().unwrap().to_string(), - file_type: "csv".to_string(), - schema: Arc::new(DFSchema::empty()), - table_partition_cols: vec![], - if_not_exists: false, - or_replace: false, - temporary: false, - definition: None, - order_exprs: vec![], - unbounded: false, - options, - constraints: Constraints::default(), - column_defaults: HashMap::new(), - }; + csv_file.path().to_str().unwrap().to_string(), + "csv", + Arc::new(DFSchema::empty()), + ) + .with_options(options) + .build(); let table_provider = factory.create(&state, &cmd).await.unwrap(); - let listing_table = table_provider - .as_any() - .downcast_ref::<ListingTable>() - .unwrap(); + let listing_table = table_provider.downcast_ref::<ListingTable>().unwrap(); let format = listing_table.options().format.clone(); - let csv_format = format.as_any().downcast_ref::<CsvFormat>().unwrap(); + let csv_format = format.downcast_ref::<CsvFormat>().unwrap(); let csv_options = csv_format.options().clone(); assert_eq!(csv_options.schema_infer_max_rec, Some(1000)); let listing_options = listing_table.options(); @@ -317,31 +319,20 @@ mod tests { options.insert("format.schema_infer_max_rec".to_owned(), "1000".to_owned()); options.insert("format.has_header".into(), "true".into()); options.insert("format.compression".into(), "gzip".into()); - let cmd = CreateExternalTable { + let cmd = CreateExternalTable::builder( name, - location: dir.path().to_str().unwrap().to_string(), - file_type: "csv".to_string(), - schema: Arc::new(DFSchema::empty()), - table_partition_cols: vec![], - if_not_exists: false, - or_replace: false, - temporary: false, - definition: None, - order_exprs: vec![], - unbounded: false, - options, - constraints: Constraints::default(), - column_defaults: HashMap::new(), - }; + dir.path().to_str().unwrap().to_string(), + "csv", + Arc::new(DFSchema::empty()), + ) + .with_options(options) + .build(); let table_provider = factory.create(&state, &cmd).await.unwrap(); - let listing_table = table_provider - .as_any() - .downcast_ref::<ListingTable>() - .unwrap(); + let listing_table = table_provider.downcast_ref::<ListingTable>().unwrap(); // Verify compression is used let format = listing_table.options().format.clone(); - let csv_format = format.as_any().downcast_ref::<CsvFormat>().unwrap(); + let csv_format = format.downcast_ref::<CsvFormat>().unwrap(); let csv_options = csv_format.options().clone(); assert_eq!(csv_options.compression, CompressionTypeVariant::GZIP); @@ -369,27 +360,16 @@ mod tests { let mut options = HashMap::new(); options.insert("format.schema_infer_max_rec".to_owned(), "1000".to_owned()); options.insert("format.has_header".into(), "true".into()); - let cmd = CreateExternalTable { + 
@@ -369,27 +360,16 @@ mod tests {
         let mut options = HashMap::new();
         options.insert("format.schema_infer_max_rec".to_owned(), "1000".to_owned());
         options.insert("format.has_header".into(), "true".into());
-        let cmd = CreateExternalTable {
+        let cmd = CreateExternalTable::builder(
             name,
-            location: dir.path().to_str().unwrap().to_string(),
-            file_type: "csv".to_string(),
-            schema: Arc::new(DFSchema::empty()),
-            table_partition_cols: vec![],
-            if_not_exists: false,
-            or_replace: false,
-            temporary: false,
-            definition: None,
-            order_exprs: vec![],
-            unbounded: false,
-            options,
-            constraints: Constraints::default(),
-            column_defaults: HashMap::new(),
-        };
+            dir.path().to_str().unwrap().to_string(),
+            "csv",
+            Arc::new(DFSchema::empty()),
+        )
+        .with_options(options)
+        .build();
         let table_provider = factory.create(&state, &cmd).await.unwrap();
-        let listing_table = table_provider
-            .as_any()
-            .downcast_ref::<ListingTable>()
-            .unwrap();
+        let listing_table = table_provider.downcast_ref::<ListingTable>().unwrap();
         let listing_options = listing_table.options();

         assert_eq!("", listing_options.file_extension);
@@ -413,27 +393,15 @@ mod tests {
         let state = context.state();
         let name = TableReference::bare("foo");

-        let cmd = CreateExternalTable {
+        let cmd = CreateExternalTable::builder(
             name,
-            location: String::from(path.to_str().unwrap()),
-            file_type: "parquet".to_string(),
-            schema: Arc::new(DFSchema::empty()),
-            table_partition_cols: vec![],
-            if_not_exists: false,
-            or_replace: false,
-            temporary: false,
-            definition: None,
-            order_exprs: vec![],
-            unbounded: false,
-            options: HashMap::new(),
-            constraints: Constraints::default(),
-            column_defaults: HashMap::new(),
-        };
+            String::from(path.to_str().unwrap()),
+            "parquet",
+            Arc::new(DFSchema::empty()),
+        )
+        .build();
         let table_provider = factory.create(&state, &cmd).await.unwrap();
-        let listing_table = table_provider
-            .as_any()
-            .downcast_ref::<ListingTable>()
-            .unwrap();
+        let listing_table = table_provider.downcast_ref::<ListingTable>().unwrap();
         let listing_options = listing_table.options();

         assert_eq!("", listing_options.file_extension);
@@ -453,27 +421,15 @@ mod tests {
         let state = context.state();
         let name = TableReference::bare("foo");

-        let cmd = CreateExternalTable {
+        let cmd = CreateExternalTable::builder(
             name,
-            location: dir.path().to_str().unwrap().to_string(),
-            file_type: "parquet".to_string(),
-            schema: Arc::new(DFSchema::empty()),
-            table_partition_cols: vec![],
-            if_not_exists: false,
-            or_replace: false,
-            temporary: false,
-            definition: None,
-            order_exprs: vec![],
-            unbounded: false,
-            options: HashMap::new(),
-            constraints: Constraints::default(),
-            column_defaults: HashMap::new(),
-        };
+            dir.path().to_str().unwrap(),
+            "parquet",
+            Arc::new(DFSchema::empty()),
+        )
+        .build();
         let table_provider = factory.create(&state, &cmd).await.unwrap();
-        let listing_table = table_provider
-            .as_any()
-            .downcast_ref::<ListingTable>()
-            .unwrap();
+        let listing_table = table_provider.downcast_ref::<ListingTable>().unwrap();
         let listing_options = listing_table.options();

         let dtype =
@@ -494,29 +450,193 @@ mod tests {
         let state = context.state();
         let name = TableReference::bare("foo");

-        let cmd = CreateExternalTable {
+        let cmd = CreateExternalTable::builder(
             name,
-            location: dir.path().to_str().unwrap().to_string(),
-            file_type: "parquet".to_string(),
-            schema: Arc::new(DFSchema::empty()),
-            table_partition_cols: vec![],
-            if_not_exists: false,
-            or_replace: false,
-            temporary: false,
-            definition: None,
-            order_exprs: vec![],
-            unbounded: false,
-            options: HashMap::new(),
-            constraints: Constraints::default(),
-            column_defaults: HashMap::new(),
-        };
+            dir.path().to_str().unwrap().to_string(),
+            "parquet",
+            Arc::new(DFSchema::empty()),
+        )
+        .build();
         let table_provider = factory.create(&state, &cmd).await.unwrap();
-        let listing_table = table_provider
-            .as_any()
-            .downcast_ref::<ListingTable>()
-            .unwrap();
+        let listing_table = table_provider.downcast_ref::<ListingTable>().unwrap();
         let listing_options = listing_table.options();

         assert!(listing_options.table_partition_cols.is_empty());
     }
+
+    #[tokio::test]
+    async fn test_statistics_cache_prewarming() {
+        let factory = ListingTableFactory::new();
+
+        let location = PathBuf::from(parquet_test_data())
+            .join("alltypes_tiny_pages_plain.parquet")
+            .to_string_lossy()
+            .to_string();
+
+        // Test with collect_statistics enabled
+        let file_statistics_cache = Arc::new(DefaultFileStatisticsCache::default());
+        let cache_config = CacheManagerConfig::default()
+            .with_files_statistics_cache(Some(file_statistics_cache.clone()));
+        let runtime = RuntimeEnvBuilder::new()
+            .with_cache_manager(cache_config)
+            .build_arc()
+            .unwrap();
+
+        let mut config = SessionConfig::new();
+        config.options_mut().execution.collect_statistics = true;
+        let context = SessionContext::new_with_config_rt(config, runtime);
+        let state = context.state();
+        let name = TableReference::bare("test");
+
+        let cmd = CreateExternalTable::builder(
+            name,
+            location.clone(),
+            "parquet",
+            Arc::new(DFSchema::empty()),
+        )
+        .build();
+
+        let _table_provider = factory.create(&state, &cmd).await.unwrap();
+
+        assert!(
+            file_statistics_cache.len() > 0,
+            "Statistics cache should be pre-warmed when collect_statistics is enabled"
+        );
+
+        // Test with collect_statistics disabled
+        let file_statistics_cache = Arc::new(DefaultFileStatisticsCache::default());
+        let cache_config = CacheManagerConfig::default()
+            .with_files_statistics_cache(Some(file_statistics_cache.clone()));
+        let runtime = RuntimeEnvBuilder::new()
+            .with_cache_manager(cache_config)
+            .build_arc()
+            .unwrap();
+
+        let mut config = SessionConfig::new();
+        config.options_mut().execution.collect_statistics = false;
+        let context = SessionContext::new_with_config_rt(config, runtime);
+        let state = context.state();
+        let name = TableReference::bare("test");
+
+        let cmd = CreateExternalTable::builder(
+            name,
+            location,
+            "parquet",
+            Arc::new(DFSchema::empty()),
+        )
+        .build();
+
+        let _table_provider = factory.create(&state, &cmd).await.unwrap();
+
+        assert_eq!(
+            file_statistics_cache.len(),
+            0,
+            "Statistics cache should not be pre-warmed when collect_statistics is disabled"
+        );
+    }
+
+    #[tokio::test]
+    async fn test_create_with_invalid_session() {
+        use datafusion_common::config::TableOptions;
+        use datafusion_execution::TaskContext;
+        use datafusion_execution::config::SessionConfig;
+        use datafusion_physical_expr::PhysicalExpr;
+        use datafusion_physical_plan::ExecutionPlan;
+        use std::any::Any;
+        use std::collections::HashMap;
+
+        // A mock Session that is NOT SessionState
+        #[derive(Debug)]
+        struct MockSession;
+
+        #[async_trait]
+        impl Session for MockSession {
+            fn session_id(&self) -> &str {
+                "mock_session"
+            }
+            fn config(&self) -> &SessionConfig {
+                unimplemented!()
+            }
+            async fn create_physical_plan(
+                &self,
+                _logical_plan: &datafusion_expr::LogicalPlan,
+            ) -> Result<Arc<dyn ExecutionPlan>> {
+                unimplemented!()
+            }
+            fn create_physical_expr(
+                &self,
+                _expr: datafusion_expr::Expr,
+                _df_schema: &DFSchema,
+            ) -> Result<Arc<dyn PhysicalExpr>> {
+                unimplemented!()
+            }
+            fn scalar_functions(
+                &self,
+            ) -> &HashMap<String, Arc<datafusion_expr::ScalarUDF>> {
+                unimplemented!()
+            }
+            fn higher_order_functions(
+                &self,
+            ) -> &HashMap> {
+                unimplemented!()
+            }
+            fn aggregate_functions(
+                &self,
+            ) -> &HashMap<String, Arc<datafusion_expr::AggregateUDF>> {
+                unimplemented!()
+            }
+            fn window_functions(
+                &self,
+            ) -> &HashMap<String, Arc<datafusion_expr::WindowUDF>> {
+                unimplemented!()
+            }

+            fn extension_type_registry(&self) -> &ExtensionTypeRegistryRef {
+                unreachable!()
+            }

+            fn runtime_env(&self) -> &Arc<RuntimeEnv> {
+                unimplemented!()
+            }
+            fn execution_props(
+                &self,
+            ) -> &datafusion_expr::execution_props::ExecutionProps {
+                unimplemented!()
+            }
+            fn as_any(&self) -> &dyn Any {
+                self
+            }
+            fn table_options(&self) -> &TableOptions {
+                unimplemented!()
+            }
+            fn table_options_mut(&mut self) -> &mut TableOptions {
+                unimplemented!()
+            }
+            fn task_ctx(&self) -> Arc<TaskContext> {
+                unimplemented!()
+            }
+        }
+
+        let factory = ListingTableFactory::new();
+        let mock_session = MockSession;
+
+        let name = TableReference::bare("foo");
+        let cmd = CreateExternalTable::builder(
+            name,
+            "foo.csv".to_string(),
+            "csv",
+            Arc::new(DFSchema::empty()),
+        )
+        .build();
+
+        // This should return an error, not panic
+        let result = factory.create(&mock_session, &cmd).await;
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .strip_backtrace()
+                .contains("Internal error: ListingTableFactory requires SessionState")
+        );
+    }
 }
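The `test_create_with_invalid_session` test added above pins down that a `Session` implementation other than `SessionState` is rejected with an internal error rather than a panic. A self-contained sketch of the kind of downcast guard that yields this behavior; the trait and types here are stand-ins for illustration, not DataFusion's actual definitions:

use std::any::Any;

trait Session {
    fn as_any(&self) -> &dyn Any;
}

struct SessionState;

impl Session for SessionState {
    fn as_any(&self) -> &dyn Any {
        self
    }
}

struct MockSession;

impl Session for MockSession {
    fn as_any(&self) -> &dyn Any {
        self
    }
}

fn require_session_state(session: &dyn Session) -> Result<&SessionState, String> {
    // Downcast instead of assuming: a foreign `Session` impl (like MockSession)
    // produces an error the caller can surface, never a panic.
    session
        .as_any()
        .downcast_ref::<SessionState>()
        .ok_or_else(|| "Internal error: ListingTableFactory requires SessionState".to_string())
}

fn main() {
    assert!(require_session_state(&SessionState).is_ok());
    assert!(require_session_state(&MockSession).is_err());
}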
diff --git a/datafusion/core/src/datasource/memory_test.rs b/datafusion/core/src/datasource/memory_test.rs
index c16837c73b4f1..c7721cafb02ea 100644
--- a/datafusion/core/src/datasource/memory_test.rs
+++ b/datafusion/core/src/datasource/memory_test.rs
@@ -19,7 +19,7 @@
 mod tests {
     use crate::datasource::MemTable;
-    use crate::datasource::{provider_as_source, DefaultTableSource};
+    use crate::datasource::{DefaultTableSource, provider_as_source};
     use crate::physical_plan::collect;
     use crate::prelude::SessionContext;
     use arrow::array::{AsArray, Int32Array};
@@ -29,8 +29,8 @@ mod tests {
     use arrow_schema::SchemaRef;
     use datafusion_catalog::TableProvider;
     use datafusion_common::{DataFusionError, Result};
-    use datafusion_expr::dml::InsertOp;
     use datafusion_expr::LogicalPlanBuilder;
+    use datafusion_expr::dml::InsertOp;
     use futures::StreamExt;
     use std::collections::HashMap;
     use std::sync::Arc;
@@ -329,12 +329,11 @@ mod tests {
         );
         let col = batch.column(0).as_primitive::<Int32Type>();
         assert_eq!(col.len(), 1, "expected 1 row, got {}", col.len());
-        let val = col
-            .iter()
+
+        col.iter()
             .next()
             .expect("had value")
-            .expect("expected non null");
-        val
+            .expect("expected non null")
     }

     // Test inserting a single batch of data into a single partition
diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs
index 37b9663111a53..de54078aafef4 100644
--- a/datafusion/core/src/datasource/mod.rs
+++ b/datafusion/core/src/datasource/mod.rs
@@ -20,7 +20,6 @@
 //! 
[`ListingTable`]: crate::datasource::listing::ListingTable pub mod dynamic_file; -pub mod empty; pub mod file_format; pub mod listing; pub mod listing_table_factory; @@ -31,7 +30,7 @@ mod view_test; // backwards compatibility pub use self::default_table_source::{ - provider_as_source, source_as_provider, DefaultTableSource, + DefaultTableSource, provider_as_source, source_as_provider, }; pub use self::memory::MemTable; pub use self::view::ViewTable; @@ -39,9 +38,11 @@ pub use crate::catalog::TableProvider; pub use crate::logical_expr::TableType; pub use datafusion_catalog::cte_worktable; pub use datafusion_catalog::default_table_source; +pub use datafusion_catalog::empty; pub use datafusion_catalog::memory; pub use datafusion_catalog::stream; pub use datafusion_catalog::view; +pub use datafusion_datasource::projection; pub use datafusion_datasource::schema_adapter; pub use datafusion_datasource::sink; pub use datafusion_datasource::source; @@ -53,32 +54,35 @@ pub use datafusion_physical_expr::create_ordering; mod tests { use crate::prelude::SessionContext; - use ::object_store::{path::Path, ObjectMeta}; + use ::object_store::{ObjectMeta, path::Path}; use arrow::{ - array::{Int32Array, StringArray}, + array::Int32Array, datatypes::{DataType, Field, Schema, SchemaRef}, record_batch::RecordBatch, }; - use datafusion_common::{record_batch, test_util::batches_to_sort_string}; + use datafusion_common::{ + Result, ScalarValue, + test_util::batches_to_sort_string, + tree_node::{Transformed, TransformedResult, TreeNode}, + }; use datafusion_datasource::{ - file::FileSource, - file_scan_config::FileScanConfigBuilder, - schema_adapter::{ - DefaultSchemaAdapterFactory, SchemaAdapter, SchemaAdapterFactory, - SchemaMapper, - }, - source::DataSourceExec, - PartitionedFile, + PartitionedFile, file_scan_config::FileScanConfigBuilder, source::DataSourceExec, }; use datafusion_datasource_parquet::source::ParquetSource; + use datafusion_physical_expr::expressions::{Column, Literal}; + use datafusion_physical_expr_adapter::{ + PhysicalExprAdapter, PhysicalExprAdapterFactory, + }; + use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_plan::collect; use std::{fs, sync::Arc}; use tempfile::TempDir; + use url::Url; #[tokio::test] - async fn can_override_schema_adapter() { - // Test shows that SchemaAdapter can add a column that doesn't existing in the - // record batches returned from parquet. This can be useful for schema evolution + async fn can_override_physical_expr_adapter() { + // Test shows that PhysicalExprAdapter can add a column that doesn't exist in the + // record batches returned from parquet. This can be useful for schema evolution // where older files may not have all columns. 
use datafusion_execution::object_store::ObjectStoreUrl; @@ -101,7 +105,8 @@ mod tests { writer.write(&rec_batch).unwrap(); writer.close().unwrap(); - let location = Path::parse(path.to_str().unwrap()).unwrap(); + let url = Url::from_file_path(path.canonicalize().unwrap()).unwrap(); + let location = Path::from_url_path(url.path()).unwrap(); let metadata = fs::metadata(path.as_path()).expect("Local file metadata"); let meta = ObjectMeta { location, @@ -111,29 +116,18 @@ mod tests { version: None, }; - let partitioned_file = PartitionedFile { - object_meta: meta, - partition_values: vec![], - range: None, - statistics: None, - extensions: None, - metadata_size_hint: None, - }; + let partitioned_file = PartitionedFile::new_from_meta(meta); let f1 = Field::new("id", DataType::Int32, true); let f2 = Field::new("extra_column", DataType::Utf8, true); let schema = Arc::new(Schema::new(vec![f1.clone(), f2.clone()])); - let source = ParquetSource::default() - .with_schema_adapter_factory(Arc::new(TestSchemaAdapterFactory {})) - .unwrap(); - let base_conf = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - schema, - source, - ) - .with_file(partitioned_file) - .build(); + let source = Arc::new(ParquetSource::new(Arc::clone(&schema))); + let base_conf = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source) + .with_file(partitioned_file) + .with_expr_adapter(Some(Arc::new(TestPhysicalExprAdapterFactory))) + .build(); let parquet_exec = DataSourceExec::from_data_source(base_conf); @@ -141,134 +135,52 @@ mod tests { let task_ctx = session_ctx.task_ctx(); let read = collect(parquet_exec, task_ctx).await.unwrap(); - insta::assert_snapshot!(batches_to_sort_string(&read),@r###" + insta::assert_snapshot!(batches_to_sort_string(&read),@r" +----+--------------+ | id | extra_column | +----+--------------+ | 1 | foo | +----+--------------+ - "###); - } - - #[test] - fn default_schema_adapter() { - let table_schema = Schema::new(vec![ - Field::new("a", DataType::Int32, true), - Field::new("b", DataType::Utf8, true), - ]); - - // file has a subset of the table schema fields and different type - let file_schema = Schema::new(vec![ - Field::new("c", DataType::Float64, true), // not in table schema - Field::new("b", DataType::Float64, true), - ]); - - let adapter = DefaultSchemaAdapterFactory::from_schema(Arc::new(table_schema)); - let (mapper, indices) = adapter.map_schema(&file_schema).unwrap(); - assert_eq!(indices, vec![1]); - - let file_batch = record_batch!(("b", Float64, vec![1.0, 2.0])).unwrap(); - - let mapped_batch = mapper.map_batch(file_batch).unwrap(); - - // the mapped batch has the correct schema and the "b" column has been cast to Utf8 - let expected_batch = record_batch!( - ("a", Int32, vec![None, None]), // missing column filled with nulls - ("b", Utf8, vec!["1.0", "2.0"]) // b was cast to string and order was changed - ) - .unwrap(); - assert_eq!(mapped_batch, expected_batch); - } - - #[test] - fn default_schema_adapter_non_nullable_columns() { - let table_schema = Schema::new(vec![ - Field::new("a", DataType::Int32, false), // "a"" is declared non nullable - Field::new("b", DataType::Utf8, true), - ]); - let file_schema = Schema::new(vec![ - // since file doesn't have "a" it will be filled with nulls - Field::new("b", DataType::Float64, true), - ]); - - let adapter = DefaultSchemaAdapterFactory::from_schema(Arc::new(table_schema)); - let (mapper, indices) = adapter.map_schema(&file_schema).unwrap(); - assert_eq!(indices, vec![0]); - - let file_batch = 
record_batch!(("b", Float64, vec![1.0, 2.0])).unwrap();
-
-        // Mapping fails because it tries to fill in a non-nullable column with nulls
-        let err = mapper.map_batch(file_batch).unwrap_err().to_string();
-        assert!(err.contains("Invalid argument error: Column 'a' is declared as non-nullable but contains null values"), "{err}");
+        ");
     }

     #[derive(Debug)]
-    struct TestSchemaAdapterFactory;
+    struct TestPhysicalExprAdapterFactory;

-    impl SchemaAdapterFactory for TestSchemaAdapterFactory {
+    impl PhysicalExprAdapterFactory for TestPhysicalExprAdapterFactory {
         fn create(
             &self,
-            projected_table_schema: SchemaRef,
-            _table_schema: SchemaRef,
-        ) -> Box<dyn SchemaAdapter> {
-            Box::new(TestSchemaAdapter {
-                table_schema: projected_table_schema,
-            })
+            _logical_file_schema: SchemaRef,
+            physical_file_schema: SchemaRef,
+        ) -> Result<Arc<dyn PhysicalExprAdapter>> {
+            Ok(Arc::new(TestPhysicalExprAdapter {
+                physical_file_schema,
+            }))
         }
     }

-    struct TestSchemaAdapter {
-        /// Schema for the table
-        table_schema: SchemaRef,
+    #[derive(Debug)]
+    struct TestPhysicalExprAdapter {
+        physical_file_schema: SchemaRef,
     }

-    impl SchemaAdapter for TestSchemaAdapter {
-        fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option<usize> {
-            let field = self.table_schema.field(index);
-            Some(file_schema.fields.find(field.name())?.0)
-        }
-
-        fn map_schema(
-            &self,
-            file_schema: &Schema,
-        ) -> datafusion_common::Result<(Arc<dyn SchemaMapper>, Vec<usize>)> {
-            let mut projection = Vec::with_capacity(file_schema.fields().len());
-
-            for (file_idx, file_field) in file_schema.fields.iter().enumerate() {
-                if self.table_schema.fields().find(file_field.name()).is_some() {
-                    projection.push(file_idx);
+    impl PhysicalExprAdapter for TestPhysicalExprAdapter {
+        fn rewrite(&self, expr: Arc<dyn PhysicalExpr>) -> Result<Arc<dyn PhysicalExpr>> {
+            expr.transform(|e| {
+                if let Some(column) = e.downcast_ref::<Column>() {
+                    // If column is "extra_column" and missing from physical schema, inject "foo"
+                    if column.name() == "extra_column"
+                        && self.physical_file_schema.index_of("extra_column").is_err()
+                    {
+                        return Ok(Transformed::yes(Arc::new(Literal::new(
+                            ScalarValue::Utf8(Some("foo".to_string())),
+                        ))
+                            as Arc<dyn PhysicalExpr>));
+                    }
                 }
-            }
-
-            Ok((Arc::new(TestSchemaMapping {}), projection))
-        }
-    }
-
-    #[derive(Debug)]
-    struct TestSchemaMapping {}
-
-    impl SchemaMapper for TestSchemaMapping {
-        fn map_batch(
-            &self,
-            batch: RecordBatch,
-        ) -> datafusion_common::Result<RecordBatch> {
-            let f1 = Field::new("id", DataType::Int32, true);
-            let f2 = Field::new("extra_column", DataType::Utf8, true);
-
-            let schema = Arc::new(Schema::new(vec![f1, f2]));
-
-            let extra_column = Arc::new(StringArray::from(vec!["foo"]));
-            let mut new_columns = batch.columns().to_vec();
-            new_columns.push(extra_column);
-
-            Ok(RecordBatch::try_new(schema, new_columns).unwrap())
-        }
-
-        fn map_column_statistics(
-            &self,
-            _file_col_statistics: &[datafusion_common::ColumnStatistics],
-        ) -> datafusion_common::Result<Vec<datafusion_common::ColumnStatistics>> {
-            unimplemented!()
+                Ok(Transformed::no(e))
+            })
+            .data()
         }
     }
 }
diff --git a/datafusion/core/src/datasource/physical_plan/avro.rs b/datafusion/core/src/datasource/physical_plan/avro.rs
index 9068c9758179d..2954a47403299 100644
--- a/datafusion/core/src/datasource/physical_plan/avro.rs
+++ b/datafusion/core/src/datasource/physical_plan/avro.rs
@@ -31,21 +31,21 @@ mod tests {
     use crate::test::object_store::local_unpartitioned_file;
     use arrow::datatypes::{DataType, Field, SchemaBuilder};
     use datafusion_common::test_util::batches_to_string;
-    use datafusion_common::{test_util, Result, ScalarValue};
+    use datafusion_common::{Result, ScalarValue, test_util};
     use 
datafusion_datasource::file_format::FileFormat; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; - use datafusion_datasource::PartitionedFile; - use datafusion_datasource_avro::source::AvroSource; + use datafusion_datasource::{PartitionedFile, TableSchema}; use datafusion_datasource_avro::AvroFormat; + use datafusion_datasource_avro::source::AvroSource; use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_physical_plan::ExecutionPlan; use datafusion_datasource::source::DataSourceExec; use futures::StreamExt; use insta::assert_snapshot; + use object_store::ObjectStore; use object_store::chunked::ChunkedStore; use object_store::local::LocalFileSystem; - use object_store::ObjectStore; use rstest::*; use url::Url; @@ -81,15 +81,11 @@ mod tests { .infer_schema(&state, &store, std::slice::from_ref(&meta)) .await?; - let source = Arc::new(AvroSource::new()); - let conf = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - file_schema, - source, - ) - .with_file(meta.into()) - .with_projection_indices(Some(vec![0, 1, 2])) - .build(); + let source = Arc::new(AvroSource::new(Arc::clone(&file_schema))); + let conf = FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source) + .with_file(meta.into()) + .with_projection_indices(Some(vec![0, 1, 2]))? + .build(); let source_exec = DataSourceExec::from_data_source(conf); assert_eq!( @@ -109,20 +105,20 @@ mod tests { .expect("plan iterator empty") .expect("plan iterator returned an error"); - insta::allow_duplicates! {assert_snapshot!(batches_to_string(&[batch]), @r###" - +----+----------+-------------+ - | id | bool_col | tinyint_col | - +----+----------+-------------+ - | 4 | true | 0 | - | 5 | false | 1 | - | 6 | true | 0 | - | 7 | false | 1 | - | 2 | true | 0 | - | 3 | false | 1 | - | 0 | true | 0 | - | 1 | false | 1 | - +----+----------+-------------+ - "###);} + insta::allow_duplicates! {assert_snapshot!(batches_to_string(&[batch]), @r" + +----+----------+-------------+ + | id | bool_col | tinyint_col | + +----+----------+-------------+ + | 4 | true | 0 | + | 5 | false | 1 | + | 6 | true | 0 | + | 7 | false | 1 | + | 2 | true | 0 | + | 3 | false | 1 | + | 0 | true | 0 | + | 1 | false | 1 | + +----+----------+-------------+ + ");} let batch = results.next().await; assert!(batch.is_none()); @@ -157,10 +153,10 @@ mod tests { // Include the missing column in the projection let projection = Some(vec![0, 1, 2, actual_schema.fields().len()]); - let source = Arc::new(AvroSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let source = Arc::new(AvroSource::new(Arc::clone(&file_schema))); + let conf = FileScanConfigBuilder::new(object_store_url, source) .with_file(meta.into()) - .with_projection_indices(projection) + .with_projection_indices(projection)? .build(); let source_exec = DataSourceExec::from_data_source(conf); @@ -182,20 +178,20 @@ mod tests { .expect("plan iterator empty") .expect("plan iterator returned an error"); - insta::allow_duplicates! {assert_snapshot!(batches_to_string(&[batch]), @r###" - +----+----------+-------------+-------------+ - | id | bool_col | tinyint_col | missing_col | - +----+----------+-------------+-------------+ - | 4 | true | 0 | | - | 5 | false | 1 | | - | 6 | true | 0 | | - | 7 | false | 1 | | - | 2 | true | 0 | | - | 3 | false | 1 | | - | 0 | true | 0 | | - | 1 | false | 1 | | - +----+----------+-------------+-------------+ - "###);} + insta::allow_duplicates! 
{assert_snapshot!(batches_to_string(&[batch]), @r" + +----+----------+-------------+-------------+ + | id | bool_col | tinyint_col | missing_col | + +----+----------+-------------+-------------+ + | 4 | true | 0 | | + | 5 | false | 1 | | + | 6 | true | 0 | | + | 7 | false | 1 | | + | 2 | true | 0 | | + | 3 | false | 1 | | + | 0 | true | 0 | | + | 1 | false | 1 | | + +----+----------+-------------+-------------+ + ");} let batch = results.next().await; assert!(batch.is_none()); @@ -227,13 +223,16 @@ mod tests { partitioned_file.partition_values = vec![ScalarValue::from("2021-10-26")]; let projection = Some(vec![0, 1, file_schema.fields().len(), 2]); - let source = Arc::new(AvroSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let table_schema = TableSchema::new( + file_schema.clone(), + vec![Arc::new(Field::new("date", DataType::Utf8, false))], + ); + let source = Arc::new(AvroSource::new(table_schema.clone())); + let conf = FileScanConfigBuilder::new(object_store_url, source) // select specific columns of the files as well as the partitioning // column which is supposed to be the last column in the table schema. - .with_projection_indices(projection) + .with_projection_indices(projection)? .with_file(partitioned_file) - .with_table_partition_cols(vec![Field::new("date", DataType::Utf8, false)]) .build(); let source_exec = DataSourceExec::from_data_source(conf); @@ -256,20 +255,20 @@ mod tests { .expect("plan iterator empty") .expect("plan iterator returned an error"); - insta::allow_duplicates! {assert_snapshot!(batches_to_string(&[batch]), @r###" - +----+----------+------------+-------------+ - | id | bool_col | date | tinyint_col | - +----+----------+------------+-------------+ - | 4 | true | 2021-10-26 | 0 | - | 5 | false | 2021-10-26 | 1 | - | 6 | true | 2021-10-26 | 0 | - | 7 | false | 2021-10-26 | 1 | - | 2 | true | 2021-10-26 | 0 | - | 3 | false | 2021-10-26 | 1 | - | 0 | true | 2021-10-26 | 0 | - | 1 | false | 2021-10-26 | 1 | - +----+----------+------------+-------------+ - "###);} + insta::allow_duplicates! 
{assert_snapshot!(batches_to_string(&[batch]), @r"
+    +----+----------+------------+-------------+
+    | id | bool_col | date       | tinyint_col |
+    +----+----------+------------+-------------+
+    | 4  | true     | 2021-10-26 | 0           |
+    | 5  | false    | 2021-10-26 | 1           |
+    | 6  | true     | 2021-10-26 | 0           |
+    | 7  | false    | 2021-10-26 | 1           |
+    | 2  | true     | 2021-10-26 | 0           |
+    | 3  | false    | 2021-10-26 | 1           |
+    | 0  | true     | 2021-10-26 | 0           |
+    | 1  | false    | 2021-10-26 | 1           |
+    +----+----------+------------+-------------+
+    ");}

         let batch = results.next().await;
         assert!(batch.is_none());
diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs
index 4f46a57d8b137..82c47b6c7281c 100644
--- a/datafusion/core/src/datasource/physical_plan/csv.rs
+++ b/datafusion/core/src/datasource/physical_plan/csv.rs
@@ -29,18 +29,21 @@ mod tests {
     use std::io::Write;
     use std::sync::Arc;

+    use datafusion_datasource::TableSchema;
     use datafusion_datasource_csv::CsvFormat;
-    use object_store::ObjectStore;
+    use object_store::{ObjectStore, ObjectStoreExt};

+    use crate::datasource::file_format::FileFormat;
     use crate::prelude::CsvReadOptions;
     use crate::prelude::SessionContext;
     use crate::test::partitioned_file_groups;
+    use datafusion_common::config::CsvOptions;
     use datafusion_common::test_util::arrow_test_data;
     use datafusion_common::test_util::batches_to_string;
-    use datafusion_common::{assert_batches_eq, Result};
+    use datafusion_common::{Result, assert_batches_eq};
     use datafusion_execution::config::SessionConfig;
-    use datafusion_physical_plan::metrics::MetricsSet;
     use datafusion_physical_plan::ExecutionPlan;
+    use datafusion_physical_plan::metrics::MetricsSet;

     #[cfg(feature = "compression")]
     use datafusion_datasource::file_compression_type::FileCompressionType;
@@ -94,32 +97,39 @@ mod tests {
     async fn csv_exec_with_projection(
         file_compression_type: FileCompressionType,
     ) -> Result<()> {
+        use datafusion_datasource::TableSchema;
+
         let session_ctx = SessionContext::new();
         let task_ctx = session_ctx.task_ctx();
         let file_schema = aggr_test_schema();
         let path = format!("{}/csv", arrow_test_data());
         let filename = "aggregate_test_100.csv";
         let tmp_dir = TempDir::new()?;
+        let csv_format: Arc<dyn FileFormat> = Arc::new(CsvFormat::default());
         let file_groups = partitioned_file_groups(
             path.as_str(),
             filename,
             1,
-            Arc::new(CsvFormat::default()),
+            &csv_format,
             file_compression_type.to_owned(),
             tmp_dir.path(),
         )?;

-        let source = Arc::new(CsvSource::new(true, b',', b'"'));
-        let config = FileScanConfigBuilder::from(partitioned_csv_config(
-            file_schema,
-            file_groups,
-            source,
-        ))
-        .with_file_compression_type(file_compression_type)
-        .with_newlines_in_values(false)
-        .with_projection_indices(Some(vec![0, 2, 4]))
-        .build();
+        let options = CsvOptions {
+            has_header: Some(true),
+            delimiter: b',',
+            quote: b'"',
+            ..Default::default()
+        };
+        let table_schema = TableSchema::from_file_schema(Arc::clone(&file_schema));
+        let source =
+            Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options));
+        let config =
+            FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?)
+                .with_file_compression_type(file_compression_type)
+                .with_projection_indices(Some(vec![0, 2, 4]))?
+                .build();

         assert_eq!(13, config.file_schema().fields().len());
         let csv = DataSourceExec::from_data_source(config);
@@ -131,17 +141,17 @@ mod tests {
         assert_eq!(3, batch.num_columns());
         assert_eq!(100, batch.num_rows());

-        insta::allow_duplicates! 
{assert_snapshot!(batches_to_string(&[batch.slice(0, 5)]), @r###" - +----+-----+------------+ - | c1 | c3 | c5 | - +----+-----+------------+ - | c | 1 | 2033001162 | - | d | -40 | 706441268 | - | b | 29 | 994303988 | - | a | -85 | 1171968280 | - | b | -82 | 1824882165 | - +----+-----+------------+ - "###);} + insta::allow_duplicates! {assert_snapshot!(batches_to_string(&[batch.slice(0, 5)]), @r" + +----+-----+------------+ + | c1 | c3 | c5 | + +----+-----+------------+ + | c | 1 | 2033001162 | + | d | -40 | 706441268 | + | b | 29 | 994303988 | + | a | -85 | 1171968280 | + | b | -82 | 1824882165 | + +----+-----+------------+ + ");} Ok(()) } @@ -158,6 +168,8 @@ mod tests { async fn csv_exec_with_mixed_order_projection( file_compression_type: FileCompressionType, ) -> Result<()> { + use datafusion_datasource::TableSchema; + let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); let session_ctx = SessionContext::new_with_config(cfg); let task_ctx = session_ctx.task_ctx(); @@ -165,26 +177,31 @@ mod tests { let path = format!("{}/csv", arrow_test_data()); let filename = "aggregate_test_100.csv"; let tmp_dir = TempDir::new()?; + let csv_format: Arc = Arc::new(CsvFormat::default()); let file_groups = partitioned_file_groups( path.as_str(), filename, 1, - Arc::new(CsvFormat::default()), + &csv_format, file_compression_type.to_owned(), tmp_dir.path(), )?; - let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = FileScanConfigBuilder::from(partitioned_csv_config( - file_schema, - file_groups, - source, - )) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()) - .with_projection_indices(Some(vec![4, 0, 2])) - .build(); + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let table_schema = TableSchema::from_file_schema(Arc::clone(&file_schema)); + let source = + Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?) + .with_file_compression_type(file_compression_type.to_owned()) + .with_projection_indices(Some(vec![4, 0, 2]))? + .build(); assert_eq!(13, config.file_schema().fields().len()); let csv = DataSourceExec::from_data_source(config); assert_eq!(3, csv.schema().fields().len()); @@ -194,17 +211,17 @@ mod tests { assert_eq!(3, batch.num_columns()); assert_eq!(100, batch.num_rows()); - insta::allow_duplicates! {assert_snapshot!(batches_to_string(&[batch.slice(0, 5)]), @r###" - +------------+----+-----+ - | c5 | c1 | c3 | - +------------+----+-----+ - | 2033001162 | c | 1 | - | 706441268 | d | -40 | - | 994303988 | b | 29 | - | 1171968280 | a | -85 | - | 1824882165 | b | -82 | - +------------+----+-----+ - "###);} + insta::allow_duplicates! 
{assert_snapshot!(batches_to_string(&[batch.slice(0, 5)]), @r" + +------------+----+-----+ + | c5 | c1 | c3 | + +------------+----+-----+ + | 2033001162 | c | 1 | + | 706441268 | d | -40 | + | 994303988 | b | 29 | + | 1171968280 | a | -85 | + | 1824882165 | b | -82 | + +------------+----+-----+ + ");} Ok(()) } @@ -221,6 +238,7 @@ mod tests { async fn csv_exec_with_limit( file_compression_type: FileCompressionType, ) -> Result<()> { + use datafusion_datasource::TableSchema; use futures::StreamExt; let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); @@ -230,26 +248,31 @@ mod tests { let path = format!("{}/csv", arrow_test_data()); let filename = "aggregate_test_100.csv"; let tmp_dir = TempDir::new()?; + let csv_format: Arc = Arc::new(CsvFormat::default()); let file_groups = partitioned_file_groups( path.as_str(), filename, 1, - Arc::new(CsvFormat::default()), + &csv_format, file_compression_type.to_owned(), tmp_dir.path(), )?; - let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = FileScanConfigBuilder::from(partitioned_csv_config( - file_schema, - file_groups, - source, - )) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()) - .with_limit(Some(5)) - .build(); + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let table_schema = TableSchema::from_file_schema(Arc::clone(&file_schema)); + let source = + Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?) + .with_file_compression_type(file_compression_type.to_owned()) + .with_limit(Some(5)) + .build(); assert_eq!(13, config.file_schema().fields().len()); let csv = DataSourceExec::from_data_source(config); assert_eq!(13, csv.schema().fields().len()); @@ -259,17 +282,17 @@ mod tests { assert_eq!(13, batch.num_columns()); assert_eq!(5, batch.num_rows()); - insta::allow_duplicates! 
{assert_snapshot!(batches_to_string(&[batch]), @r###" - +----+----+-----+--------+------------+----------------------+-----+-------+------------+----------------------+-------------+---------------------+--------------------------------+ - | c1 | c2 | c3 | c4 | c5 | c6 | c7 | c8 | c9 | c10 | c11 | c12 | c13 | - +----+----+-----+--------+------------+----------------------+-----+-------+------------+----------------------+-------------+---------------------+--------------------------------+ - | c | 2 | 1 | 18109 | 2033001162 | -6513304855495910254 | 25 | 43062 | 1491205016 | 5863949479783605708 | 0.110830784 | 0.9294097332465232 | 6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW | - | d | 5 | -40 | 22614 | 706441268 | -7542719935673075327 | 155 | 14337 | 3373581039 | 11720144131976083864 | 0.69632107 | 0.3114712539863804 | C2GT5KVyOPZpgKVl110TyZO0NcJ434 | - | b | 1 | 29 | -18218 | 994303988 | 5983957848665088916 | 204 | 9489 | 3275293996 | 14857091259186476033 | 0.53840446 | 0.17909035118828576 | AyYVExXK6AR2qUTxNZ7qRHQOVGMLcz | - | a | 1 | -85 | -15154 | 1171968280 | 1919439543497968449 | 77 | 52286 | 774637006 | 12101411955859039553 | 0.12285209 | 0.6864391962767343 | 0keZ5G8BffGwgF2RwQD59TFzMStxCB | - | b | 5 | -82 | 22080 | 1824882165 | 7373730676428214987 | 208 | 34331 | 3342719438 | 3330177516592499461 | 0.82634634 | 0.40975383525297016 | Ig1QcuKsjHXkproePdERo2w0mYzIqd | - +----+----+-----+--------+------------+----------------------+-----+-------+------------+----------------------+-------------+---------------------+--------------------------------+ - "###);} + insta::allow_duplicates! {assert_snapshot!(batches_to_string(&[batch]), @r" + +----+----+-----+--------+------------+----------------------+-----+-------+------------+----------------------+-------------+---------------------+--------------------------------+ + | c1 | c2 | c3 | c4 | c5 | c6 | c7 | c8 | c9 | c10 | c11 | c12 | c13 | + +----+----+-----+--------+------------+----------------------+-----+-------+------------+----------------------+-------------+---------------------+--------------------------------+ + | c | 2 | 1 | 18109 | 2033001162 | -6513304855495910254 | 25 | 43062 | 1491205016 | 5863949479783605708 | 0.110830784 | 0.9294097332465232 | 6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW | + | d | 5 | -40 | 22614 | 706441268 | -7542719935673075327 | 155 | 14337 | 3373581039 | 11720144131976083864 | 0.69632107 | 0.3114712539863804 | C2GT5KVyOPZpgKVl110TyZO0NcJ434 | + | b | 1 | 29 | -18218 | 994303988 | 5983957848665088916 | 204 | 9489 | 3275293996 | 14857091259186476033 | 0.53840446 | 0.17909035118828576 | AyYVExXK6AR2qUTxNZ7qRHQOVGMLcz | + | a | 1 | -85 | -15154 | 1171968280 | 1919439543497968449 | 77 | 52286 | 774637006 | 12101411955859039553 | 0.12285209 | 0.6864391962767343 | 0keZ5G8BffGwgF2RwQD59TFzMStxCB | + | b | 5 | -82 | 22080 | 1824882165 | 7373730676428214987 | 208 | 34331 | 3342719438 | 3330177516592499461 | 0.82634634 | 0.40975383525297016 | Ig1QcuKsjHXkproePdERo2w0mYzIqd | + +----+----+-----+--------+------------+----------------------+-----+-------+------------+----------------------+-------------+---------------------+--------------------------------+ + ");} Ok(()) } @@ -287,32 +310,39 @@ mod tests { async fn csv_exec_with_missing_column( file_compression_type: FileCompressionType, ) -> Result<()> { + use datafusion_datasource::TableSchema; + let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); let file_schema = aggr_test_schema_with_missing_col(); let path = format!("{}/csv", arrow_test_data()); let 
filename = "aggregate_test_100.csv"; let tmp_dir = TempDir::new()?; + let csv_format: Arc = Arc::new(CsvFormat::default()); let file_groups = partitioned_file_groups( path.as_str(), filename, 1, - Arc::new(CsvFormat::default()), + &csv_format, file_compression_type.to_owned(), tmp_dir.path(), )?; - let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = FileScanConfigBuilder::from(partitioned_csv_config( - file_schema, - file_groups, - source, - )) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()) - .with_limit(Some(5)) - .build(); + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let table_schema = TableSchema::from_file_schema(Arc::clone(&file_schema)); + let source = + Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?) + .with_file_compression_type(file_compression_type.to_owned()) + .with_limit(Some(5)) + .build(); assert_eq!(14, config.file_schema().fields().len()); let csv = DataSourceExec::from_data_source(config); assert_eq!(14, csv.schema().fields().len()); @@ -341,6 +371,7 @@ mod tests { file_compression_type: FileCompressionType, ) -> Result<()> { use datafusion_common::ScalarValue; + use datafusion_datasource::TableSchema; let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); @@ -348,12 +379,13 @@ mod tests { let path = format!("{}/csv", arrow_test_data()); let filename = "aggregate_test_100.csv"; let tmp_dir = TempDir::new()?; + let csv_format: Arc = Arc::new(CsvFormat::default()); let mut file_groups = partitioned_file_groups( path.as_str(), filename, 1, - Arc::new(CsvFormat::default()), + &csv_format, file_compression_type.to_owned(), tmp_dir.path(), )?; @@ -362,19 +394,25 @@ mod tests { let num_file_schema_fields = file_schema.fields().len(); - let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = FileScanConfigBuilder::from(partitioned_csv_config( - file_schema, - file_groups, - source, - )) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()) - .with_table_partition_cols(vec![Field::new("date", DataType::Utf8, false)]) - // We should be able to project on the partition column - // Which is supposed to be after the file fields - .with_projection_indices(Some(vec![0, num_file_schema_fields])) - .build(); + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let table_schema = TableSchema::new( + Arc::clone(&file_schema), + vec![Arc::new(Field::new("date", DataType::Utf8, false))], + ); + let source = + Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?) + .with_file_compression_type(file_compression_type.to_owned()) + // We should be able to project on the partition column + // Which is supposed to be after the file fields + .with_projection_indices(Some(vec![0, num_file_schema_fields]))? + .build(); // we don't have `/date=xx/` in the path but that is ok because // partitions are resolved during scan anyway @@ -388,17 +426,17 @@ mod tests { assert_eq!(2, batch.num_columns()); assert_eq!(100, batch.num_rows()); - insta::allow_duplicates! 
{assert_snapshot!(batches_to_string(&[batch.slice(0, 5)]), @r###" - +----+------------+ - | c1 | date | - +----+------------+ - | c | 2021-10-26 | - | d | 2021-10-26 | - | b | 2021-10-26 | - | a | 2021-10-26 | - | b | 2021-10-26 | - +----+------------+ - "###);} + insta::allow_duplicates! {assert_snapshot!(batches_to_string(&[batch.slice(0, 5)]), @r" + +----+------------+ + | c1 | date | + +----+------------+ + | c | 2021-10-26 | + | d | 2021-10-26 | + | b | 2021-10-26 | + | a | 2021-10-26 | + | b | 2021-10-26 | + +----+------------+ + ");} let metrics = csv.metrics().expect("doesn't found metrics"); let time_elapsed_processing = get_value(&metrics, "time_elapsed_processing"); @@ -452,26 +490,31 @@ mod tests { let path = format!("{}/csv", arrow_test_data()); let filename = "aggregate_test_100.csv"; let tmp_dir = TempDir::new()?; + let csv_format: Arc = Arc::new(CsvFormat::default()); let file_groups = partitioned_file_groups( path.as_str(), filename, 1, - Arc::new(CsvFormat::default()), + &csv_format, file_compression_type.to_owned(), tmp_dir.path(), ) .unwrap(); - let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = FileScanConfigBuilder::from(partitioned_csv_config( - file_schema, - file_groups, - source, - )) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()) - .build(); + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let table_schema = TableSchema::from_file_schema(Arc::clone(&file_schema)); + let source = + Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?) + .with_file_compression_type(file_compression_type.to_owned()) + .build(); let csv = DataSourceExec::from_data_source(config); let it = csv.execute(0, task_ctx).unwrap(); @@ -527,14 +570,14 @@ mod tests { let result = df.collect().await.unwrap(); - assert_snapshot!(batches_to_string(&result), @r###" - +---+---+ - | a | b | - +---+---+ - | 1 | 2 | - | 3 | 4 | - +---+---+ - "###); + assert_snapshot!(batches_to_string(&result), @r" + +---+---+ + | a | b | + +---+---+ + | 1 | 2 | + | 3 | 4 | + +---+---+ + "); } #[tokio::test] @@ -556,14 +599,14 @@ mod tests { let result = df.collect().await.unwrap(); - assert_snapshot!(batches_to_string(&result),@r###" - +---+---+ - | a | b | - +---+---+ - | 1 | 2 | - | 3 | 4 | - +---+---+ - "###); + assert_snapshot!(batches_to_string(&result),@r" + +---+---+ + | a | b | + +---+---+ + | 1 | 2 | + | 3 | 4 | + +---+---+ + "); let e = session_ctx .read_csv("memory:///", CsvReadOptions::new().terminator(Some(b'\n'))) @@ -572,7 +615,10 @@ mod tests { .collect() .await .unwrap_err(); - assert_eq!(e.strip_backtrace(), "Arrow error: Csv error: incorrect number of fields for line 1, expected 2 got more than 2") + assert_eq!( + e.strip_backtrace(), + "Arrow error: Csv error: incorrect number of fields for line 1, expected 2 got more than 2" + ) } #[tokio::test] @@ -593,22 +639,22 @@ mod tests { .await?; let df = ctx.sql(r#"select * from t1"#).await?.collect().await?; - assert_snapshot!(batches_to_string(&df),@r###" - +------+--------+ - | col1 | col2 | - +------+--------+ - | id0 | value0 | - | id1 | value1 | - | id2 | value2 | - | id3 | value3 | - +------+--------+ - "###); + assert_snapshot!(batches_to_string(&df),@r" + +------+--------+ + | col1 | col2 | + +------+--------+ + | id0 | value0 | + | id1 | value1 | + | id2 | value2 | + | id3 | value3 | + 
+------+--------+
+        ");

         Ok(())
     }

     #[tokio::test]
-    async fn test_create_external_table_with_terminator_with_newlines_in_values(
-    ) -> Result<()> {
+    async fn test_create_external_table_with_terminator_with_newlines_in_values()
+    -> Result<()> {
         let ctx = SessionContext::new();
         ctx.sql(r#"
 CREATE EXTERNAL TABLE t1 (
@@ -658,7 +704,10 @@ mod tests {
         )
         .await
         .expect_err("should fail because input file does not match inferred schema");
-        assert_eq!(e.strip_backtrace(), "Arrow error: Parser error: Error while parsing value 'd' as type 'Int64' for column 0 at line 4. Row data: '[d,4]'");
+        assert_eq!(
+            e.strip_backtrace(),
+            "Arrow error: Parser error: Error while parsing value 'd' as type 'Int64' for column 0 at line 4. Row data: '[d,4]'"
+        );

         Ok(())
     }
diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs
index f7d5c710bf48a..b70791c7b2390 100644
--- a/datafusion/core/src/datasource/physical_plan/json.rs
+++ b/datafusion/core/src/datasource/physical_plan/json.rs
@@ -32,11 +32,11 @@ mod tests {

     use crate::dataframe::DataFrameWriteOptions;
     use crate::execution::SessionState;
-    use crate::prelude::{CsvReadOptions, NdJsonReadOptions, SessionContext};
+    use crate::prelude::{CsvReadOptions, JsonReadOptions, SessionContext};
     use crate::test::partitioned_file_groups;
+    use datafusion_common::Result;
     use datafusion_common::cast::{as_int32_array, as_int64_array, as_string_array};
     use datafusion_common::test_util::batches_to_string;
-    use datafusion_common::Result;
     use datafusion_datasource::file_compression_type::FileCompressionType;
     use datafusion_datasource::file_format::FileFormat;
     use datafusion_datasource_json::JsonFormat;
@@ -51,9 +51,9 @@ mod tests {
     use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
     use datafusion_datasource::source::DataSourceExec;
     use insta::assert_snapshot;
+    use object_store::ObjectStore;
     use object_store::chunked::ChunkedStore;
     use object_store::local::LocalFileSystem;
-    use object_store::ObjectStore;
     use rstest::*;
     use tempfile::TempDir;
     use url::Url;
@@ -69,11 +69,13 @@ mod tests {
         let store = state.runtime_env().object_store(&store_url).unwrap();

         let filename = "1.json";
+        let json_format: Arc<dyn FileFormat> = Arc::new(JsonFormat::default());
+
         let file_groups = partitioned_file_groups(
             TEST_DATA_BASE,
             filename,
             1,
-            Arc::new(JsonFormat::default()),
+            &json_format,
             file_compression_type.to_owned(),
             work_dir,
         )
@@ -104,11 +106,13 @@ mod tests {
         ctx.register_object_store(&url, store.clone());
         let filename = "1.json";
         let tmp_dir = TempDir::new()?;
+        let json_format: Arc<dyn FileFormat> = Arc::new(JsonFormat::default());
+
         let file_groups = partitioned_file_groups(
             TEST_DATA_BASE,
             filename,
             1,
-            Arc::new(JsonFormat::default()),
+            &json_format,
             file_compression_type.to_owned(),
             tmp_dir.path(),
         )
@@ -132,22 +136,22 @@ mod tests {
             .get_ext_with_compression(&file_compression_type)
             .unwrap();

-        let read_options = NdJsonReadOptions::default()
+        let read_options = JsonReadOptions::default()
             .file_extension(ext.as_str())
             .file_compression_type(file_compression_type.to_owned());

         let frame = ctx.read_json(path, read_options).await.unwrap();
         let results = frame.collect().await.unwrap();

-        insta::allow_duplicates! 
{assert_snapshot!(batches_to_string(&results), @r###" - +-----+------------------+---------------+------+ - | a | b | c | d | - +-----+------------------+---------------+------+ - | 1 | [2.0, 1.3, -6.1] | [false, true] | 4 | - | -10 | [2.0, 1.3, -6.1] | [true, true] | 4 | - | 2 | [2.0, , -6.1] | [false, ] | text | - | | | | | - +-----+------------------+---------------+------+ - "###);} + insta::allow_duplicates! {assert_snapshot!(batches_to_string(&results), @r" + +-----+------------------+---------------+------+ + | a | b | c | d | + +-----+------------------+---------------+------+ + | 1 | [2.0, 1.3, -6.1] | [false, true] | 4 | + | -10 | [2.0, 1.3, -6.1] | [true, true] | 4 | + | 2 | [2.0, , -6.1] | [false, ] | text | + | | | | | + +-----+------------------+---------------+------+ + ");} Ok(()) } @@ -176,8 +180,8 @@ mod tests { let (object_store_url, file_groups, file_schema) = prepare_store(&state, file_compression_type.to_owned(), tmp_dir.path()).await; - let source = Arc::new(JsonSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let source = Arc::new(JsonSource::new(Arc::clone(&file_schema))); + let conf = FileScanConfigBuilder::new(object_store_url, source) .with_file_groups(file_groups) .with_limit(Some(3)) .with_file_compression_type(file_compression_type.to_owned()) @@ -251,8 +255,8 @@ mod tests { let file_schema = Arc::new(builder.finish()); let missing_field_idx = file_schema.fields.len() - 1; - let source = Arc::new(JsonSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let source = Arc::new(JsonSource::new(Arc::clone(&file_schema))); + let conf = FileScanConfigBuilder::new(object_store_url, source) .with_file_groups(file_groups) .with_limit(Some(3)) .with_file_compression_type(file_compression_type.to_owned()) @@ -294,10 +298,11 @@ mod tests { let (object_store_url, file_groups, file_schema) = prepare_store(&state, file_compression_type.to_owned(), tmp_dir.path()).await; - let source = Arc::new(JsonSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let source = Arc::new(JsonSource::new(Arc::clone(&file_schema))); + let conf = FileScanConfigBuilder::new(object_store_url, source) .with_file_groups(file_groups) .with_projection_indices(Some(vec![0, 2])) + .unwrap() .with_file_compression_type(file_compression_type.to_owned()) .build(); let exec = DataSourceExec::from_data_source(conf); @@ -342,10 +347,10 @@ mod tests { let (object_store_url, file_groups, file_schema) = prepare_store(&state, file_compression_type.to_owned(), tmp_dir.path()).await; - let source = Arc::new(JsonSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let source = Arc::new(JsonSource::new(Arc::clone(&file_schema))); + let conf = FileScanConfigBuilder::new(object_store_url, source) .with_file_groups(file_groups) - .with_projection_indices(Some(vec![3, 0, 2])) + .with_projection_indices(Some(vec![3, 0, 2]))? 
.with_file_compression_type(file_compression_type.to_owned()) .build(); let exec = DataSourceExec::from_data_source(conf); @@ -384,7 +389,7 @@ mod tests { let path = format!("{TEST_DATA_BASE}/1.json"); // register json file with the execution context - ctx.register_json("test", path.as_str(), NdJsonReadOptions::default()) + ctx.register_json("test", path.as_str(), JsonReadOptions::default()) .await?; // register a local file system object store for /tmp directory @@ -426,7 +431,7 @@ mod tests { } // register each partition as well as the top level dir - let json_read_option = NdJsonReadOptions::default(); + let json_read_option = JsonReadOptions::default(); ctx.register_json( "part0", &format!("{out_dir}/{part_0_name}"), @@ -494,7 +499,10 @@ mod tests { .write_json(out_dir_url, DataFrameWriteOptions::new(), None) .await .expect_err("should fail because input file does not match inferred schema"); - assert_eq!(e.strip_backtrace(), "Arrow error: Parser error: Error while parsing value 'd' as type 'Int64' for column 0 at line 4. Row data: '[d,4]'"); + assert_eq!( + e.strip_backtrace(), + "Arrow error: Parser error: Error while parsing value 'd' as type 'Int64' for column 0 at line 4. Row data: '[d,4]'" + ); Ok(()) } @@ -503,7 +511,7 @@ mod tests { async fn read_test_data(schema_infer_max_records: usize) -> Result { let ctx = SessionContext::new(); - let options = NdJsonReadOptions { + let options = JsonReadOptions { schema_infer_max_records, ..Default::default() }; @@ -579,7 +587,7 @@ mod tests { .get_ext_with_compression(&file_compression_type) .unwrap(); - let read_option = NdJsonReadOptions::default() + let read_option = JsonReadOptions::default() .file_compression_type(file_compression_type) .file_extension(ext.as_str()); diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs index 1ac292e260fdf..8e4855afa66bb 100644 --- a/datafusion/core/src/datasource/physical_plan/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/mod.rs @@ -43,146 +43,11 @@ pub use datafusion_datasource::file::FileSource; pub use datafusion_datasource::file_groups::FileGroup; pub use datafusion_datasource::file_groups::FileGroupPartitioner; pub use datafusion_datasource::file_scan_config::{ - wrap_partition_type_in_dict, wrap_partition_value_in_dict, FileScanConfig, - FileScanConfigBuilder, + FileScanConfig, FileScanConfigBuilder, wrap_partition_type_in_dict, + wrap_partition_value_in_dict, }; pub use datafusion_datasource::file_sink_config::*; pub use datafusion_datasource::file_stream::{ - FileOpenFuture, FileOpener, FileStream, OnError, + FileOpenFuture, FileOpener, FileStream, FileStreamBuilder, OnError, }; - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use arrow::array::{ - cast::AsArray, - types::{Float32Type, Float64Type, UInt32Type}, - BinaryArray, BooleanArray, Float32Array, Int32Array, Int64Array, RecordBatch, - StringArray, UInt64Array, - }; - use arrow::datatypes::{DataType, Field, Schema}; - use arrow_schema::SchemaRef; - - use crate::datasource::schema_adapter::{ - DefaultSchemaAdapterFactory, SchemaAdapterFactory, - }; - - #[test] - fn schema_mapping_map_batch() { - let table_schema = Arc::new(Schema::new(vec![ - Field::new("c1", DataType::Utf8, true), - Field::new("c2", DataType::UInt32, true), - Field::new("c3", DataType::Float64, true), - ])); - - let adapter = DefaultSchemaAdapterFactory - .create(table_schema.clone(), table_schema.clone()); - - let file_schema = Schema::new(vec![ - Field::new("c1", DataType::Utf8, true), 
-            Field::new("c2", DataType::UInt64, true),
-            Field::new("c3", DataType::Float32, true),
-        ]);
-
-        let (mapping, _) = adapter.map_schema(&file_schema).expect("map schema failed");
-
-        let c1 = StringArray::from(vec!["hello", "world"]);
-        let c2 = UInt64Array::from(vec![9_u64, 5_u64]);
-        let c3 = Float32Array::from(vec![2.0_f32, 7.0_f32]);
-        let batch = RecordBatch::try_new(
-            Arc::new(file_schema),
-            vec![Arc::new(c1), Arc::new(c2), Arc::new(c3)],
-        )
-        .unwrap();
-
-        let mapped_batch = mapping.map_batch(batch).unwrap();
-
-        assert_eq!(mapped_batch.schema(), table_schema);
-        assert_eq!(mapped_batch.num_columns(), 3);
-        assert_eq!(mapped_batch.num_rows(), 2);
-
-        let c1 = mapped_batch.column(0).as_string::<i32>();
-        let c2 = mapped_batch.column(1).as_primitive::<UInt32Type>();
-        let c3 = mapped_batch.column(2).as_primitive::<Float64Type>();
-
-        assert_eq!(c1.value(0), "hello");
-        assert_eq!(c1.value(1), "world");
-        assert_eq!(c2.value(0), 9_u32);
-        assert_eq!(c2.value(1), 5_u32);
-        assert_eq!(c3.value(0), 2.0_f64);
-        assert_eq!(c3.value(1), 7.0_f64);
-    }
-
-    #[test]
-    fn schema_adapter_map_schema_with_projection() {
-        let table_schema = Arc::new(Schema::new(vec![
-            Field::new("c0", DataType::Utf8, true),
-            Field::new("c1", DataType::Utf8, true),
-            Field::new("c2", DataType::Float64, true),
-            Field::new("c3", DataType::Int32, true),
-            Field::new("c4", DataType::Float32, true),
-        ]));
-
-        let file_schema = Schema::new(vec![
-            Field::new("id", DataType::Int32, true),
-            Field::new("c1", DataType::Boolean, true),
-            Field::new("c2", DataType::Float32, true),
-            Field::new("c3", DataType::Binary, true),
-            Field::new("c4", DataType::Int64, true),
-        ]);
-
-        let indices = vec![1, 2, 4];
-        let schema = SchemaRef::from(table_schema.project(&indices).unwrap());
-        let adapter = DefaultSchemaAdapterFactory.create(schema, table_schema.clone());
-        let (mapping, projection) = adapter.map_schema(&file_schema).unwrap();
-
-        let id = Int32Array::from(vec![Some(1), Some(2), Some(3)]);
-        let c1 = BooleanArray::from(vec![Some(true), Some(false), Some(true)]);
-        let c2 = Float32Array::from(vec![Some(2.0_f32), Some(7.0_f32), Some(3.0_f32)]);
-        let c3 = BinaryArray::from_opt_vec(vec![
-            Some(b"hallo"),
-            Some(b"danke"),
-            Some(b"super"),
-        ]);
-        let c4 = Int64Array::from(vec![1, 2, 3]);
-        let batch = RecordBatch::try_new(
-            Arc::new(file_schema),
-            vec![
-                Arc::new(id),
-                Arc::new(c1),
-                Arc::new(c2),
-                Arc::new(c3),
-                Arc::new(c4),
-            ],
-        )
-        .unwrap();
-        let rows_num = batch.num_rows();
-        let projected = batch.project(&projection).unwrap();
-        let mapped_batch = mapping.map_batch(projected).unwrap();
-
-        assert_eq!(
-            mapped_batch.schema(),
-            Arc::new(table_schema.project(&indices).unwrap())
-        );
-        assert_eq!(mapped_batch.num_columns(), indices.len());
-        assert_eq!(mapped_batch.num_rows(), rows_num);
-
-        let c1 = mapped_batch.column(0).as_string::<i32>();
-        let c2 = mapped_batch.column(1).as_primitive::<Float64Type>();
-        let c4 = mapped_batch.column(2).as_primitive::<Float32Type>();
-
-        assert_eq!(c1.value(0), "true");
-        assert_eq!(c1.value(1), "false");
-        assert_eq!(c1.value(2), "true");
-
-        assert_eq!(c2.value(0), 2.0_f64);
-        assert_eq!(c2.value(1), 7.0_f64);
-        assert_eq!(c2.value(2), 3.0_f64);
-
-        assert_eq!(c4.value(0), 1.0_f32);
-        assert_eq!(c4.value(1), 2.0_f32);
-        assert_eq!(c4.value(2), 3.0_f32);
-    }
-}
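The deleted `DefaultSchemaAdapter` tests above pinned down two behaviors that the expression-adapter approach now has to preserve: a table column missing from the file is materialized as nulls, and a file column with a mismatched type is cast to the table type. The same semantics can be checked directly against plain arrow-rs kernels; this sketch is independent of the removed adapter API, and the column names are taken from the deleted tests:

use std::sync::Arc;
use arrow::array::{new_null_array, ArrayRef, Float64Array};
use arrow::compute::cast;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // The file only has "b" as Float64; the table wants "a" Int32 and "b" Utf8.
    let b_file: ArrayRef = Arc::new(Float64Array::from(vec![1.0, 2.0]));
    let a = new_null_array(&DataType::Int32, b_file.len()); // missing column -> nulls
    let b = cast(&b_file, &DataType::Utf8)?; // 1.0 -> "1.0", 2.0 -> "2.0"
    let table_schema = Arc::new(Schema::new(vec![
        Field::new("a", DataType::Int32, true),
        Field::new("b", DataType::Utf8, true),
    ]));
    let mapped = RecordBatch::try_new(table_schema, vec![a, b])?;
    assert_eq!(mapped.num_rows(), 2);
    assert_eq!(mapped.column(0).null_count(), 2);
    Ok(())
}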
@@ -38,29 +38,29 @@ mod tests { use crate::prelude::{ParquetReadOptions, SessionConfig, SessionContext}; use crate::test::object_store::local_unpartitioned_file; use arrow::array::{ - ArrayRef, AsArray, Date64Array, Int32Array, Int64Array, Int8Array, StringArray, - StringViewArray, StructArray, TimestampNanosecondArray, + ArrayRef, AsArray, Date64Array, DictionaryArray, Int8Array, Int32Array, + Int64Array, StringArray, StringViewArray, StructArray, TimestampNanosecondArray, }; - use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaBuilder}; + use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaBuilder, UInt16Type}; use arrow::record_batch::RecordBatch; use arrow::util::pretty::pretty_format_batches; use arrow_schema::{SchemaRef, TimeUnit}; use bytes::{BufMut, BytesMut}; use datafusion_common::config::TableParquetOptions; use datafusion_common::test_util::{batches_to_sort_string, batches_to_string}; - use datafusion_common::{assert_contains, Result, ScalarValue}; + use datafusion_common::{Result, ScalarValue, assert_contains}; use datafusion_datasource::file_format::FileFormat; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::source::DataSourceExec; use datafusion_datasource::file::FileSource; - use datafusion_datasource::{FileRange, PartitionedFile, TableSchema}; + use datafusion_datasource::{PartitionedFile, TableSchema}; use datafusion_datasource_parquet::source::ParquetSource; use datafusion_datasource_parquet::{ DefaultParquetFileReaderFactory, ParquetFileReaderFactory, ParquetFormat, }; use datafusion_execution::object_store::ObjectStoreUrl; - use datafusion_expr::{col, lit, when, Expr}; + use datafusion_expr::{Expr, col, lit, when}; use datafusion_physical_expr::planner::logical2physical; use datafusion_physical_plan::analyze::AnalyzeExec; use datafusion_physical_plan::collect; @@ -161,7 +161,7 @@ mod tests { .as_ref() .map(|p| logical2physical(p, &table_schema)); - let mut source = ParquetSource::default(); + let mut source = ParquetSource::new(table_schema); if let Some(predicate) = predicate { source = source.with_predicate(predicate); } @@ -186,23 +186,20 @@ mod tests { source = source.with_bloom_filter_on_read(false); } - source.with_schema(TableSchema::new(Arc::clone(&table_schema), vec![])) + Arc::new(source) } fn build_parquet_exec( &self, - file_schema: SchemaRef, file_group: FileGroup, source: Arc<ParquetSource>, ) -> Arc<DataSourceExec> { - let base_config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - file_schema, - source, - ) - .with_file_group(file_group) - .with_projection_indices(self.projection.clone()) - .build(); + let base_config = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source) + .with_file_group(file_group) + .with_projection_indices(self.projection.clone()) + .unwrap() + .build(); DataSourceExec::from_data_source(base_config) } @@ -231,19 +228,16 @@ mod tests { // build a ParquetExec to return the results let parquet_source = self.build_file_source(Arc::clone(table_schema)); - let parquet_exec = self.build_parquet_exec( - Arc::clone(table_schema), - file_group.clone(), - Arc::clone(&parquet_source), - ); + let parquet_exec = + self.build_parquet_exec(file_group.clone(), Arc::clone(&parquet_source)); let analyze_exec = Arc::new(AnalyzeExec::new( false, false, - vec![MetricType::SUMMARY, MetricType::DEV], + vec![MetricType::Summary, MetricType::Dev], + None, // use a new ParquetSource to avoid sharing execution metrics
self.build_parquet_exec( - Arc::clone(table_schema), file_group.clone(), self.build_file_source(Arc::clone(table_schema)), ), @@ -313,7 +307,7 @@ mod tests { let batch = RecordBatch::try_new(file_schema.clone(), vec![c1]).unwrap(); - // Since c2 is missing from the file and we didn't supply a custom `SchemaAdapterFactory`, + // Since c2 is missing from the file and we didn't supply a custom `PhysicalExprAdapterFactory`, // the default behavior is to fill in missing columns with nulls. // Thus this predicate will come back as false. let filter = col("c2").eq(lit(1_i32)); @@ -344,13 +338,13 @@ mod tests { .await; let batches = rt.batches.unwrap(); - insta::assert_snapshot!(batches_to_sort_string(&batches),@r###" + insta::assert_snapshot!(batches_to_sort_string(&batches),@r" +----+----+ | c1 | c2 | +----+----+ | 1 | | +----+----+ - "###); + "); let metrics = rt.parquet_exec.metrics().unwrap(); let metric = get_value(&metrics, "pushdown_rows_pruned"); @@ -371,7 +365,7 @@ mod tests { let batch = RecordBatch::try_new(file_schema.clone(), vec![c1]).unwrap(); - // Since c2 is missing from the file and we didn't supply a custom `SchemaAdapterFactory`, + // Since c2 is missing from the file and we didn't supply a custom `PhysicalExprAdapterFactory`, // the default behavior is to fill in missing columns with nulls. // Thus this predicate will come back as false. let filter = col("c2").eq(lit("abc")); @@ -402,13 +396,13 @@ mod tests { .await; let batches = rt.batches.unwrap(); - insta::assert_snapshot!(batches_to_sort_string(&batches),@r###" + insta::assert_snapshot!(batches_to_sort_string(&batches),@r" +----+----+ | c1 | c2 | +----+----+ | 1 | | +----+----+ - "###); + "); let metrics = rt.parquet_exec.metrics().unwrap(); let metric = get_value(&metrics, "pushdown_rows_pruned"); @@ -433,7 +427,7 @@ mod tests { let batch = RecordBatch::try_new(file_schema.clone(), vec![c1, c3]).unwrap(); - // Since c2 is missing from the file and we didn't supply a custom `SchemaAdapterFactory`, + // Since c2 is missing from the file and we didn't supply a custom `PhysicalExprAdapterFactory`, // the default behavior is to fill in missing columns with nulls. // Thus this predicate will come back as false. let filter = col("c2").eq(lit("abc")); @@ -464,13 +458,13 @@ mod tests { .await; let batches = rt.batches.unwrap(); - insta::assert_snapshot!(batches_to_sort_string(&batches),@r###" + insta::assert_snapshot!(batches_to_sort_string(&batches),@r" +----+----+----+ | c1 | c2 | c3 | +----+----+----+ | 1 | | 7 | +----+----+----+ - "###); + "); let metrics = rt.parquet_exec.metrics().unwrap(); let metric = get_value(&metrics, "pushdown_rows_pruned"); @@ -495,7 +489,7 @@ mod tests { let batch = RecordBatch::try_new(file_schema.clone(), vec![c3.clone(), c3]).unwrap(); - // Since c2 is missing from the file and we didn't supply a custom `SchemaAdapterFactory`, + // Since c2 is missing from the file and we didn't supply a custom `PhysicalExprAdapterFactory`, // the default behavior is to fill in missing columns with nulls. // Thus this predicate will come back as false. 
let filter = col("c2").eq(lit("abc")); @@ -526,13 +520,13 @@ mod tests { .await; let batches = rt.batches.unwrap(); - insta::assert_snapshot!(batches_to_sort_string(&batches),@r###" + insta::assert_snapshot!(batches_to_sort_string(&batches),@r" +----+----+----+ | c1 | c2 | c3 | +----+----+----+ | | | 7 | +----+----+----+ - "###); + "); let metrics = rt.parquet_exec.metrics().unwrap(); let metric = get_value(&metrics, "pushdown_rows_pruned"); @@ -575,13 +569,13 @@ mod tests { let batches = rt.batches.unwrap(); - insta::assert_snapshot!(batches_to_sort_string(&batches),@r###" + insta::assert_snapshot!(batches_to_sort_string(&batches),@r" +----+----+----+ | c1 | c2 | c3 | +----+----+----+ | 1 | | 10 | +----+----+----+ - "###); + "); let metrics = rt.parquet_exec.metrics().unwrap(); let metric = get_value(&metrics, "pushdown_rows_pruned"); @@ -605,7 +599,7 @@ mod tests { let batches = rt.batches.unwrap(); - insta::assert_snapshot!(batches_to_sort_string(&batches),@r###" + insta::assert_snapshot!(batches_to_sort_string(&batches),@r" +----+----+----+ | c1 | c2 | c3 | +----+----+----+ @@ -613,7 +607,7 @@ mod tests { | 4 | | 40 | | 5 | | 50 | +----+----+----+ - "###); + "); let metrics = rt.parquet_exec.metrics().unwrap(); let metric = get_value(&metrics, "pushdown_rows_pruned"); @@ -642,7 +636,7 @@ mod tests { .await .unwrap(); - insta::assert_snapshot!(batches_to_sort_string(&read), @r###" + insta::assert_snapshot!(batches_to_sort_string(&read), @r" +-----+----+----+ | c1 | c2 | c3 | +-----+----+----+ @@ -656,7 +650,7 @@ mod tests { | bar | | | | bar | | | +-----+----+----+ - "###); + "); } #[tokio::test] @@ -757,18 +751,18 @@ mod tests { .await .unwrap(); - insta::assert_snapshot!(batches_to_sort_string(&read),@r###" - +-----+----+----+ - | c1 | c3 | c2 | - +-----+----+----+ - | | | | - | | 10 | 1 | - | | 20 | | - | | 20 | 2 | - | Foo | 10 | | - | bar | | | - +-----+----+----+ - "###); + insta::assert_snapshot!(batches_to_sort_string(&read),@r" + +-----+----+----+ + | c1 | c3 | c2 | + +-----+----+----+ + | | | | + | | 10 | 1 | + | | 20 | | + | | 20 | 2 | + | Foo | 10 | | + | bar | | | + +-----+----+----+ + "); } #[tokio::test] @@ -789,14 +783,14 @@ mod tests { .round_trip(vec![batch1, batch2]) .await; - insta::assert_snapshot!(batches_to_sort_string(&rt.batches.unwrap()), @r###" + insta::assert_snapshot!(batches_to_sort_string(&rt.batches.unwrap()), @r" +----+----+----+ | c1 | c3 | c2 | +----+----+----+ | | 10 | 1 | | | 20 | 2 | +----+----+----+ - "###); + "); let metrics = rt.parquet_exec.metrics().unwrap(); // Note there are were 6 rows in total (across three batches) assert_eq!(get_value(&metrics, "pushdown_rows_pruned"), 4); @@ -832,7 +826,7 @@ mod tests { .await .unwrap(); - insta::assert_snapshot!(batches_to_sort_string(&read), @r###" + insta::assert_snapshot!(batches_to_sort_string(&read), @r" +-----+-----+ | c1 | c4 | +-----+-----+ @@ -843,7 +837,7 @@ mod tests { | bar | | | bar | | +-----+-----+ - "###); + "); } #[tokio::test] @@ -1002,6 +996,7 @@ mod tests { assert_eq!(read, 1, "Expected 1 rows to match the predicate"); assert_eq!(get_value(&metrics, "row_groups_pruned_statistics"), 0); assert_eq!(get_value(&metrics, "page_index_rows_pruned"), 2); + assert_eq!(get_value(&metrics, "page_index_pages_pruned"), 1); assert_eq!(get_value(&metrics, "pushdown_rows_pruned"), 1); // If we filter with a value that is completely out of the range of the data // we prune at the row group level. 
@@ -1056,18 +1051,18 @@ mod tests { // In a real query where this predicate was pushed down from a filter stage instead of created directly in the `DataSourceExec`, // the filter stage would be preserved as a separate execution plan stage so the actual query results would be as expected. - insta::assert_snapshot!(batches_to_sort_string(&read),@r###" - +-----+----+ - | c1 | c2 | - +-----+----+ - | | | - | | | - | | 1 | - | | 2 | - | Foo | | - | bar | | - +-----+----+ - "###); + insta::assert_snapshot!(batches_to_sort_string(&read),@r" + +-----+----+ + | c1 | c2 | + +-----+----+ + | | | + | | | + | | 1 | + | | 2 | + | Foo | | + | bar | | + +-----+----+ + "); } #[tokio::test] @@ -1092,13 +1087,13 @@ mod tests { .round_trip(vec![batch1, batch2]) .await; - insta::assert_snapshot!(batches_to_sort_string(&rt.batches.unwrap()), @r###" + insta::assert_snapshot!(batches_to_sort_string(&rt.batches.unwrap()), @r" +----+----+ | c1 | c2 | +----+----+ | | 1 | +----+----+ - "###); + "); let metrics = rt.parquet_exec.metrics().unwrap(); // Note there were 6 rows in total (across three batches) assert_eq!(get_value(&metrics, "pushdown_rows_pruned"), 5); @@ -1152,7 +1147,7 @@ mod tests { .round_trip(vec![batch1, batch2, batch3, batch4]) .await; - insta::assert_snapshot!(batches_to_sort_string(&rt.batches.unwrap()), @r###" + insta::assert_snapshot!(batches_to_sort_string(&rt.batches.unwrap()), @r" +------+----+ | c1 | c2 | +------+----+ @@ -1169,16 +1164,22 @@ mod tests { | Foo2 | | | Foo3 | | +------+----+ - "###); + "); let metrics = rt.parquet_exec.metrics().unwrap(); // There are 4 rows pruned in each of batch2, batch3, and // batch4 for a total of 12. batch1 had no pruning as c2 was // filled in as null - let (page_index_pruned, page_index_matched) = + let (page_index_rows_pruned, page_index_rows_matched) = get_pruning_metric(&metrics, "page_index_rows_pruned"); - assert_eq!(page_index_pruned, 12); - assert_eq!(page_index_matched, 6); + assert_eq!(page_index_rows_pruned, 12); + assert_eq!(page_index_rows_matched, 6); + + // each page has 2 rows, so the num of pages is 1/2 the number of rows + let (page_index_pages_pruned, page_index_pages_matched) = + get_pruning_metric(&metrics, "page_index_pages_pruned"); + assert_eq!(page_index_pages_pruned, 6); + assert_eq!(page_index_pages_matched, 3); } #[tokio::test] @@ -1201,14 +1202,14 @@ mod tests { .await .unwrap(); - insta::assert_snapshot!(batches_to_sort_string(&read),@r###" - +-----+----+ - | c1 | c2 | - +-----+----+ - | Foo | 1 | - | bar | | - +-----+----+ - "###); + insta::assert_snapshot!(batches_to_sort_string(&read),@r" + +-----+----+ + | c1 | c2 | + +-----+----+ + | Foo | 1 | + | bar | | + +-----+----+ + "); } #[tokio::test] @@ -1231,15 +1232,15 @@ mod tests { .await .unwrap(); - insta::assert_snapshot!(batches_to_sort_string(&read),@r###" - +-----+----+ - | c1 | c2 | - +-----+----+ - | | 2 | - | Foo | 1 | - | bar | | - +-----+----+ - "###); + insta::assert_snapshot!(batches_to_sort_string(&read),@r" + +-----+----+ + | c1 | c2 | + +-----+----+ + | | 2 | + | Foo | 1 | + | bar | | + +-----+----+ + "); } #[tokio::test] @@ -1264,7 +1265,7 @@ mod tests { ("c3", c3.clone()), ]); - // batch2: c3(int8), c2(int64), c1(string), c4(string) + // batch2: c3(date64), c2(int64), c1(string) let batch2 = create_batch(vec![("c3", c4), ("c2", c2), ("c1", c1)]); let table_schema = Schema::new(vec![ @@ -1278,8 +1279,10 @@ mod tests { .with_table_schema(Arc::new(table_schema)) .round_trip_to_batches(vec![batch1, batch2]) .await; -
assert_contains!(read.unwrap_err().to_string(), - "Cannot cast file schema field c3 of type Date64 to table schema field of type Int8"); + assert_contains!( + read.unwrap_err().to_string(), + "Cannot cast column 'c3' from 'Date64' (physical data type) to 'Int8' (logical data type)" + ); } #[tokio::test] @@ -1329,7 +1332,7 @@ mod tests { async fn parquet_exec_with_int96_from_spark() -> Result<()> { // arrow-rs relies on the chrono library to convert between timestamps and strings, so // instead compare as Int64. The underlying type should be a PrimitiveArray of Int64 - // anyway, so this should be a zero-copy non-modifying cast at the SchemaAdapter. + // anyway, so this should be a zero-copy non-modifying cast. let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, true)])); let testdata = datafusion_common::test_util::parquet_test_data(); @@ -1532,14 +1535,7 @@ mod tests { #[tokio::test] async fn parquet_exec_with_range() -> Result<()> { fn file_range(meta: &ObjectMeta, start: i64, end: i64) -> PartitionedFile { - PartitionedFile { - object_meta: meta.clone(), - partition_values: vec![], - range: Some(FileRange { start, end }), - statistics: None, - extensions: None, - metadata_size_hint: None, - } + PartitionedFile::new_from_meta(meta.clone()).with_range(start, end) } async fn assert_parquet_read( @@ -1550,8 +1546,7 @@ mod tests { ) -> Result<()> { let config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), - file_schema, - Arc::new(ParquetSource::default()), + Arc::new(ParquetSource::new(file_schema)), ) .with_file_groups(file_groups) .build(); @@ -1622,21 +1617,15 @@ mod tests { .await .unwrap(); - let partitioned_file = PartitionedFile { - object_meta: meta, - partition_values: vec![ + let partitioned_file = PartitionedFile::new_from_meta(meta) + .with_partition_values(vec![ ScalarValue::from("2021"), ScalarValue::UInt8(Some(10)), ScalarValue::Dictionary( Box::new(DataType::UInt16), Box::new(ScalarValue::from("26")), ), - ], - range: None, - statistics: None, - extensions: None, - metadata_size_hint: None, - }; + ]); let expected_schema = Schema::new(vec![ Field::new("id", DataType::Int32, true), @@ -1653,23 +1642,27 @@ mod tests { ), ]); - let source = Arc::new(ParquetSource::default()); - let config = FileScanConfigBuilder::new(object_store_url, schema.clone(), source) - .with_file(partitioned_file) - // file has 10 cols so index 12 should be month and 13 should be day - .with_projection_indices(Some(vec![0, 1, 2, 12, 13])) - .with_table_partition_cols(vec![ - Field::new("year", DataType::Utf8, false), - Field::new("month", DataType::UInt8, false), - Field::new( + let table_schema = TableSchema::new( + Arc::clone(&schema), + vec![ + Arc::new(Field::new("year", DataType::Utf8, false)), + Arc::new(Field::new("month", DataType::UInt8, false)), + Arc::new(Field::new( "day", DataType::Dictionary( Box::new(DataType::UInt16), Box::new(DataType::Utf8), ), false, - ), - ]) + )), + ], + ); + let source = Arc::new(ParquetSource::new(table_schema.clone())); + let config = FileScanConfigBuilder::new(object_store_url, source) + .with_file(partitioned_file) + // file has 10 cols so index 12 should be month and 13 should be day + .with_projection_indices(Some(vec![0, 1, 2, 12, 13])) + .unwrap() .build(); let parquet_exec = DataSourceExec::from_data_source(config); @@ -1684,20 +1677,20 @@ mod tests { let batch = results.next().await.unwrap()?; assert_eq!(batch.schema().as_ref(), &expected_schema); - assert_snapshot!(batches_to_string(&[batch]),@r###" - 
+----+----------+-------------+-------+-----+ - | id | bool_col | tinyint_col | month | day | - +----+----------+-------------+-------+-----+ - | 4 | true | 0 | 10 | 26 | - | 5 | false | 1 | 10 | 26 | - | 6 | true | 0 | 10 | 26 | - | 7 | false | 1 | 10 | 26 | - | 2 | true | 0 | 10 | 26 | - | 3 | false | 1 | 10 | 26 | - | 0 | true | 0 | 10 | 26 | - | 1 | false | 1 | 10 | 26 | - +----+----------+-------------+-------+-----+ - "###); + assert_snapshot!(batches_to_string(&[batch]),@r" + +----+----------+-------------+-------+-----+ + | id | bool_col | tinyint_col | month | day | + +----+----------+-------------+-------+-----+ + | 4 | true | 0 | 10 | 26 | + | 5 | false | 1 | 10 | 26 | + | 6 | true | 0 | 10 | 26 | + | 7 | false | 1 | 10 | 26 | + | 2 | true | 0 | 10 | 26 | + | 3 | false | 1 | 10 | 26 | + | 0 | true | 0 | 10 | 26 | + | 1 | false | 1 | 10 | 26 | + +----+----------+-------------+-------+-----+ + "); let batch = results.next().await; assert!(batch.is_none()); @@ -1711,28 +1704,20 @@ mod tests { let state = session_ctx.state(); let location = Path::from_filesystem_path(".") .unwrap() - .child("invalid.parquet"); + .join("invalid.parquet"); - let partitioned_file = PartitionedFile { - object_meta: ObjectMeta { - location, - last_modified: Utc.timestamp_nanos(0), - size: 1337, - e_tag: None, - version: None, - }, - partition_values: vec![], - range: None, - statistics: None, - extensions: None, - metadata_size_hint: None, - }; + let partitioned_file = PartitionedFile::new_from_meta(ObjectMeta { + location, + last_modified: Utc.timestamp_nanos(0), + size: 1337, + e_tag: None, + version: None, + }); let file_schema = Arc::new(Schema::empty()); let config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), - file_schema, - Arc::new(ParquetSource::default()), + Arc::new(ParquetSource::new(file_schema)), ) .with_file(partitioned_file) .build(); @@ -1757,6 +1742,7 @@ mod tests { Some(3), Some(4), Some(5), + Some(6), // last page with only one row ])); let batch1 = create_batch(vec![("int", c1.clone())]); @@ -1765,27 +1751,53 @@ mod tests { let rt = RoundTrip::new() .with_predicate(filter) .with_page_index_predicate() - .round_trip(vec![batch1]) + .round_trip(vec![batch1.clone()]) .await; let metrics = rt.parquet_exec.metrics().unwrap(); - assert_snapshot!(batches_to_sort_string(&rt.batches.unwrap()),@r###" - +-----+ - | int | - +-----+ - | 4 | - | 5 | - +-----+ - "###); - let (page_index_pruned, page_index_matched) = + assert_snapshot!(batches_to_sort_string(&rt.batches.unwrap()),@r" + +-----+ + | int | + +-----+ + | 4 | + | 5 | + +-----+ + "); + let (page_index_rows_pruned, page_index_rows_matched) = get_pruning_metric(&metrics, "page_index_rows_pruned"); - assert_eq!(page_index_pruned, 4); - assert_eq!(page_index_matched, 2); + assert_eq!(page_index_rows_pruned, 5); + assert_eq!(page_index_rows_matched, 2); assert!( get_value(&metrics, "page_index_eval_time") > 0, "no eval time in metrics: {metrics:#?}" ); + + // each page has 2 rows, so the num of pages is 1/2 the number of rows + let (page_index_pages_pruned, page_index_pages_matched) = + get_pruning_metric(&metrics, "page_index_pages_pruned"); + assert_eq!(page_index_pages_pruned, 3); + assert_eq!(page_index_pages_matched, 1); + + // test with a filter that matches the page with one row + let filter = col("int").eq(lit(6_i32)); + let rt = RoundTrip::new() + .with_predicate(filter) + .with_page_index_predicate() + .round_trip(vec![batch1]) + .await; + + let metrics = rt.parquet_exec.metrics().unwrap(); + + let 
(page_index_rows_pruned, page_index_rows_matched) = + get_pruning_metric(&metrics, "page_index_rows_pruned"); + assert_eq!(page_index_rows_pruned, 6); + assert_eq!(page_index_rows_matched, 1); + + let (page_index_pages_pruned, page_index_pages_matched) = + get_pruning_metric(&metrics, "page_index_pages_pruned"); + assert_eq!(page_index_pages_pruned, 3); + assert_eq!(page_index_pages_matched, 1); } /// Returns a string array with contents: @@ -1823,14 +1835,14 @@ mod tests { let metrics = rt.parquet_exec.metrics().unwrap(); // assert the batches and some metrics - assert_snapshot!(batches_to_string(&rt.batches.unwrap()),@r###" - +-----+ - | c1 | - +-----+ - | Foo | - | zzz | - +-----+ - "###); + assert_snapshot!(batches_to_string(&rt.batches.unwrap()),@r" + +-----+ + | c1 | + +-----+ + | Foo | + | zzz | + +-----+ + "); // pushdown predicates have eliminated all 4 bar rows and the // null row for 5 rows total @@ -1879,6 +1891,100 @@ mod tests { assert_contains!(&explain, "projection=[c1]"); } + #[tokio::test] + async fn parquet_exec_metrics_with_multiple_predicates() { + // Test that metrics are correctly calculated when multiple predicates + // are pushed down (connected with AND). This ensures we don't double-count + // rows when multiple predicates filter the data sequentially. + + // Create a batch with two columns: c1 (string) and c2 (int32) + // Total: 10 rows + let c1: ArrayRef = Arc::new(StringArray::from(vec![ + Some("foo"), // 0 - passes c1 filter, fails c2 filter (5 <= 10) + Some("bar"), // 1 - fails c1 filter + Some("bar"), // 2 - fails c1 filter + Some("baz"), // 3 - passes both filters (20 > 10) + Some("foo"), // 4 - passes both filters (12 > 10) + Some("bar"), // 5 - fails c1 filter + Some("baz"), // 6 - passes both filters (25 > 10) + Some("foo"), // 7 - passes c1 filter, fails c2 filter (7 <= 10) + Some("bar"), // 8 - fails c1 filter + Some("qux"), // 9 - passes both filters (30 > 10) + ])); + + let c2: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(5), + Some(15), + Some(8), + Some(20), + Some(12), + Some(9), + Some(25), + Some(7), + Some(18), + Some(30), + ])); + + let batch = create_batch(vec![("c1", c1), ("c2", c2)]); + + // Create filter: c1 != 'bar' AND c2 > 10 + // + // First predicate (c1 != 'bar'): + // - Rows passing: 0, 3, 4, 6, 7, 9 (6 rows) + // - Rows pruned: 1, 2, 5, 8 (4 rows) + // + // Second predicate (c2 > 10) on remaining 6 rows: + // - Rows passing: 3, 4, 6, 9 (4 rows with c2 = 20, 12, 25, 30) + // - Rows pruned: 0, 7 (2 rows with c2 = 5, 7) + // + // Expected final metrics: + // - pushdown_rows_matched: 4 (final result) + // - pushdown_rows_pruned: 4 + 2 = 6 (cumulative) + // - Total: 4 + 6 = 10 + + let filter = col("c1").not_eq(lit("bar")).and(col("c2").gt(lit(10))); + + let rt = RoundTrip::new() + .with_predicate(filter) + .with_pushdown_predicate() + .round_trip(vec![batch]) + .await; + + let metrics = rt.parquet_exec.metrics().unwrap(); + + // Verify the result rows + assert_snapshot!(batches_to_string(&rt.batches.unwrap()),@r" + +-----+----+ + | c1 | c2 | + +-----+----+ + | baz | 20 | + | foo | 12 | + | baz | 25 | + | qux | 30 | + +-----+----+ + "); + + // Verify metrics - this is the key test + let pushdown_rows_matched = get_value(&metrics, "pushdown_rows_matched"); + let pushdown_rows_pruned = get_value(&metrics, "pushdown_rows_pruned"); + + assert_eq!( + pushdown_rows_matched, 4, + "Expected 4 rows to pass both predicates" + ); + assert_eq!( + pushdown_rows_pruned, 6, + "Expected 6 rows to be pruned (4 by first predicate + 2 by second 
predicate)" + ); + + // The sum should equal the total number of rows + assert_eq!( + pushdown_rows_matched + pushdown_rows_pruned, + 10, + "matched + pruned should equal total rows" + ); + } + #[tokio::test] async fn parquet_exec_has_no_pruning_predicate_if_can_not_prune() { // batch1: c1(string) @@ -2119,13 +2225,13 @@ mod tests { let sql = "select * from base_table where name='test02'"; let batch = ctx.sql(sql).await.unwrap().collect().await.unwrap(); assert_eq!(batch.len(), 1); - insta::assert_snapshot!(batches_to_string(&batch),@r###" - +---------------------+----+--------+ - | struct | id | name | - +---------------------+----+--------+ - | {id: 4, name: aaa2} | 2 | test02 | - +---------------------+----+--------+ - "###); + insta::assert_snapshot!(batches_to_string(&batch),@r" + +---------------------+----+--------+ + | struct | id | name | + +---------------------+----+--------+ + | {id: 4, name: aaa2} | 2 | test02 | + +---------------------+----+--------+ + "); Ok(()) } @@ -2148,13 +2254,55 @@ mod tests { let sql = "select * from base_table where name='test02'"; let batch = ctx.sql(sql).await.unwrap().collect().await.unwrap(); assert_eq!(batch.len(), 1); - insta::assert_snapshot!(batches_to_string(&batch),@r###" - +---------------------+----+--------+ - | struct | id | name | - +---------------------+----+--------+ - | {id: 4, name: aaa2} | 2 | test02 | - +---------------------+----+--------+ - "###); + insta::assert_snapshot!(batches_to_string(&batch),@r" + +---------------------+----+--------+ + | struct | id | name | + +---------------------+----+--------+ + | {id: 4, name: aaa2} | 2 | test02 | + +---------------------+----+--------+ + "); + Ok(()) + } + + /// Tests that constant dictionary columns (where min == max in statistics) + /// are correctly handled. This reproduced a bug where the constant value + /// from statistics had type Utf8 but the schema expected Dictionary. 
+ #[tokio::test] + async fn test_constant_dictionary_column_parquet() -> Result<()> { + let tmp_dir = TempDir::new()?; + let path = tmp_dir.path().to_str().unwrap().to_string() + "/test.parquet"; + + // Write parquet with dictionary column where all values are the same + let schema = Arc::new(Schema::new(vec![Field::new( + "status", + DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Utf8)), + false, + )])); + let status: DictionaryArray<UInt16Type> = + vec!["active", "active"].into_iter().collect(); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(status)])?; + let file = File::create(&path)?; + let props = WriterProperties::builder() + .set_statistics_enabled(parquet::file::properties::EnabledStatistics::Page) + .build(); + let mut writer = ArrowWriter::try_new(file, schema, Some(props))?; + writer.write(&batch)?; + writer.close()?; + + // Query the constant dictionary column + let ctx = SessionContext::new(); + ctx.register_parquet("t", &path, ParquetReadOptions::default()) + .await?; + let result = ctx.sql("SELECT status FROM t").await?.collect().await?; + + insta::assert_snapshot!(batches_to_string(&result),@r" + +--------+ + | status | + +--------+ + | active | + | active | + +--------+ + "); Ok(()) } @@ -2279,42 +2427,28 @@ mod tests { let size_hint_calls = reader_factory.metadata_size_hint_calls.clone(); let source = Arc::new( - ParquetSource::default() + ParquetSource::new(Arc::clone(&schema)) .with_parquet_file_reader_factory(reader_factory) .with_metadata_size_hint(456), ); - let config = FileScanConfigBuilder::new(store_url, schema, source) + let config = FileScanConfigBuilder::new(store_url, source) .with_file( - PartitionedFile { - object_meta: ObjectMeta { - location: Path::from(name_1), - last_modified: Utc::now(), - size: total_size_1, - e_tag: None, - version: None, - }, - partition_values: vec![], - range: None, - statistics: None, - extensions: None, - metadata_size_hint: None, - } - .with_metadata_size_hint(123), - ) - .with_file(PartitionedFile { - object_meta: ObjectMeta { - location: Path::from(name_2), + PartitionedFile::new_from_meta(ObjectMeta { + location: Path::from(name_1), last_modified: Utc::now(), - size: total_size_2, + size: total_size_1, e_tag: None, version: None, - }, - partition_values: vec![], - range: None, - statistics: None, - extensions: None, - metadata_size_hint: None, - }) + }) + .with_metadata_size_hint(123), + ) + .with_file(PartitionedFile::new_from_meta(ObjectMeta { + location: Path::from(name_2), + last_modified: Utc::now(), + size: total_size_2, + e_tag: None, + version: None, + })) .build(); let exec = DataSourceExec::from_data_source(config); diff --git a/datafusion/core/src/datasource/view_test.rs b/datafusion/core/src/datasource/view_test.rs index 85ad9ff664ade..35418d6dea632 100644 --- a/datafusion/core/src/datasource/view_test.rs +++ b/datafusion/core/src/datasource/view_test.rs @@ -46,13 +46,13 @@ mod tests { .collect() .await?; - insta::assert_snapshot!(batches_to_string(&results),@r###" + insta::assert_snapshot!(batches_to_string(&results),@r" +---+ | b | +---+ | 2 | +---+ - "###); + "); Ok(()) } @@ -96,14 +96,14 @@ mod tests { .collect() .await?; - insta::assert_snapshot!(batches_to_string(&results),@r###" + insta::assert_snapshot!(batches_to_string(&results),@r" +---------+---------+---------+ | column1 | column2 | column3 | +---------+---------+---------+ | 1 | 2 | 3 | | 4 | 5 | 6 | +---------+---------+---------+ - "###); + "); let view_sql = "CREATE VIEW replace_xyz AS SELECT * REPLACE (column1*2 as
column1) FROM xyz"; @@ -115,14 +115,14 @@ mod tests { .collect() .await?; - insta::assert_snapshot!(batches_to_string(&results),@r###" + insta::assert_snapshot!(batches_to_string(&results),@r" +---------+---------+---------+ | column1 | column2 | column3 | +---------+---------+---------+ | 2 | 2 | 3 | | 8 | 5 | 6 | +---------+---------+---------+ - "###); + "); Ok(()) } @@ -146,14 +146,14 @@ mod tests { .collect() .await?; - insta::assert_snapshot!(batches_to_string(&results),@r###" + insta::assert_snapshot!(batches_to_string(&results),@r" +---------------+ | column1_alias | +---------------+ | 1 | | 4 | +---------------+ - "###); + "); Ok(()) } @@ -177,14 +177,14 @@ mod tests { .collect() .await?; - insta::assert_snapshot!(batches_to_string(&results),@r###" + insta::assert_snapshot!(batches_to_string(&results),@r" +---------------+---------------+ | column2_alias | column1_alias | +---------------+---------------+ | 2 | 1 | | 5 | 4 | +---------------+---------------+ - "###); + "); Ok(()) } @@ -213,14 +213,14 @@ mod tests { .collect() .await?; - insta::assert_snapshot!(batches_to_string(&results),@r###" + insta::assert_snapshot!(batches_to_string(&results),@r" +---------+ | column1 | +---------+ | 1 | | 4 | +---------+ - "###); + "); Ok(()) } @@ -249,13 +249,13 @@ mod tests { .collect() .await?; - insta::assert_snapshot!(batches_to_string(&results),@r###" + insta::assert_snapshot!(batches_to_string(&results),@r" +---------+ | column1 | +---------+ | 4 | +---------+ - "###); + "); Ok(()) } @@ -287,14 +287,14 @@ mod tests { .collect() .await?; - insta::assert_snapshot!(batches_to_string(&results),@r###" + insta::assert_snapshot!(batches_to_string(&results),@r" +---------+---------+---------+ | column2 | column1 | column3 | +---------+---------+---------+ | 2 | 1 | 3 | | 5 | 4 | 6 | +---------+---------+---------+ - "###); + "); Ok(()) } @@ -358,7 +358,10 @@ mod tests { .to_string(); assert!(formatted.contains("DataSourceExec: ")); assert!(formatted.contains("file_type=parquet")); - assert!(formatted.contains("projection=[bool_col, int_col], limit=10")); + assert!( + formatted.contains("projection=[bool_col, int_col], limit=10"), + "{formatted}" + ); Ok(()) } @@ -442,14 +445,14 @@ mod tests { .collect() .await?; - insta::assert_snapshot!(batches_to_string(&results),@r###" + insta::assert_snapshot!(batches_to_string(&results),@r" +---------+ | column1 | +---------+ | 1 | | 4 | +---------+ - "###); + "); Ok(()) } diff --git a/datafusion/core/src/execution/context/json.rs b/datafusion/core/src/execution/context/json.rs index e9d799400863d..f7df2ad7a1cd6 100644 --- a/datafusion/core/src/execution/context/json.rs +++ b/datafusion/core/src/execution/context/json.rs @@ -15,13 +15,13 @@ // specific language governing permissions and limitations // under the License. +use super::super::options::ReadOptions; +use super::{DataFilePaths, DataFrame, ExecutionPlan, Result, SessionContext}; +use crate::execution::options::JsonReadOptions; use datafusion_common::TableReference; use datafusion_datasource_json::source::plan_to_json; use std::sync::Arc; -use super::super::options::{NdJsonReadOptions, ReadOptions}; -use super::{DataFilePaths, DataFrame, ExecutionPlan, Result, SessionContext}; - impl SessionContext { /// Creates a [`DataFrame`] for reading an JSON data source. 
/// @@ -32,7 +32,7 @@ impl SessionContext { pub async fn read_json<P: DataFilePaths>( &self, table_paths: P, - options: NdJsonReadOptions<'_>, + options: JsonReadOptions<'_>, ) -> Result<DataFrame> { self._read_type(table_paths, options).await } @@ -43,7 +43,7 @@ impl SessionContext { &self, table_ref: impl Into<TableReference>, table_path: impl AsRef<str>, - options: NdJsonReadOptions<'_>, + options: JsonReadOptions<'_>, ) -> Result<()> { let listing_options = options .to_listing_options(&self.copied_config(), self.copied_table_options()); diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 687779787ab50..87170f595f413 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -20,6 +20,7 @@ use std::collections::HashSet; use std::fmt::Debug; use std::sync::{Arc, Weak}; +use std::time::Duration; use super::options::ReadOptions; use crate::datasource::dynamic_file::DynamicListTableFactory; @@ -33,20 +34,20 @@ use crate::{ datasource::listing::{ ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl, }, - datasource::{provider_as_source, MemTable, ViewTable}, + datasource::{MemTable, ViewTable, provider_as_source}, error::Result, execution::{ + FunctionRegistry, options::ArrowReadOptions, runtime_env::{RuntimeEnv, RuntimeEnvBuilder}, - FunctionRegistry, }, logical_expr::AggregateUDF, logical_expr::ScalarUDF, logical_expr::{ CreateCatalog, CreateCatalogSchema, CreateExternalTable, CreateFunction, CreateMemoryTable, CreateView, DropCatalogSchema, DropFunction, DropTable, - DropView, Execute, LogicalPlan, LogicalPlanBuilder, Prepare, SetVariable, - TableType, UNNAMED_TABLE, + DropView, Execute, LogicalPlan, LogicalPlanBuilder, Prepare, ResetVariable, + SetVariable, TableType, UNNAMED_TABLE, }, physical_expr::PhysicalExpr, physical_plan::ExecutionPlan, @@ -58,32 +59,44 @@ pub use crate::execution::session_state::SessionState; use arrow::datatypes::{Schema, SchemaRef}; use arrow::record_batch::RecordBatch; -use datafusion_catalog::memory::MemorySchemaProvider; use datafusion_catalog::MemoryCatalogProvider; +use datafusion_catalog::memory::MemorySchemaProvider; use datafusion_catalog::{ DynamicFileCatalog, TableFunction, TableFunctionImpl, UrlTableFactory, }; -use datafusion_common::config::ConfigOptions; +use datafusion_common::config::{ConfigField, ConfigOptions}; use datafusion_common::metadata::ScalarAndMetadata; use datafusion_common::{ + DFSchema, DataFusionError, ParamValues, SchemaReference, TableReference, config::{ConfigExtension, TableOptions}, exec_datafusion_err, exec_err, internal_datafusion_err, not_impl_err, plan_datafusion_err, plan_err, tree_node::{TreeNodeRecursion, TreeNodeVisitor}, - DFSchema, DataFusionError, ParamValues, SchemaReference, TableReference, +}; +pub use datafusion_execution::TaskContext; +use datafusion_execution::cache::cache_manager::{ + DEFAULT_LIST_FILES_CACHE_MEMORY_LIMIT, DEFAULT_LIST_FILES_CACHE_TTL, + DEFAULT_METADATA_CACHE_LIMIT, }; pub use datafusion_execution::config::SessionConfig; +use datafusion_execution::disk_manager::{ + DEFAULT_MAX_TEMP_DIRECTORY_SIZE, DiskManagerBuilder, +}; use datafusion_execution::registry::SerializerRegistry; -pub use datafusion_execution::TaskContext; +use datafusion_expr::HigherOrderUDF; pub use datafusion_expr::execution_props::ExecutionProps; +#[cfg(feature = "sql")] +use datafusion_expr::planner::RelationPlanner; +use datafusion_expr::simplify::SimplifyContext; use datafusion_expr::{ + Expr, UserDefinedLogicalNode, WindowUDF,
expr_rewriter::FunctionRewrite, logical_plan::{DdlStatement, Statement}, planner::ExprPlanner, - Expr, UserDefinedLogicalNode, WindowUDF, }; use datafusion_optimizer::analyzer::type_coercion::TypeCoercion; -use datafusion_optimizer::Analyzer; +use datafusion_optimizer::simplify_expressions::ExprSimplifier; +use datafusion_optimizer::{Analyzer, OptimizerContext}; use datafusion_optimizer::{AnalyzerRule, OptimizerRule}; use datafusion_session::SessionStore; @@ -242,7 +255,7 @@ where /// let state = SessionStateBuilder::new() /// .with_config(config) /// .with_runtime_env(runtime_env) -/// // include support for built in functions and configurations +/// // include support for built-in functions and configurations /// .with_default_features() /// .build(); /// @@ -308,7 +321,7 @@ impl SessionContext { let schema = cat .schema(schema_name.as_str()) .ok_or_else(|| internal_datafusion_err!("Schema not found!"))?; - let lister = schema.as_any().downcast_ref::<ListingSchemaProvider>(); + let lister = schema.downcast_ref::<ListingSchemaProvider>(); if let Some(lister) = lister { lister.refresh(&self.state()).await?; } @@ -476,6 +489,11 @@ impl SessionContext { self.state.write().append_optimizer_rule(optimizer_rule); } + /// Removes an optimizer rule by name, returning `true` if it existed. + pub fn remove_optimizer_rule(&self, name: &str) -> bool { + self.state.write().remove_optimizer_rule(name) + } + /// Adds an analyzer rule to the end of the existing rules. /// /// See [`SessionState`] for more control of when the rule is applied. @@ -513,19 +531,14 @@ impl SessionContext { self.runtime_env().deregister_object_store(url) } - /// Registers the [`RecordBatch`] as the specified table name + /// Registers the given [`RecordBatch`] as the specified table reference. pub fn register_batch( &self, - table_name: &str, + table_ref: impl Into<TableReference>, batch: RecordBatch, ) -> Result<Option<Arc<dyn TableProvider>>> { let table = MemTable::try_new(batch.schema(), vec![vec![batch]])?; - self.register_table( - TableReference::Bare { - table: table_name.into(), - }, - Arc::new(table), - ) + self.register_table(table_ref, Arc::new(table)) } /// Return the [RuntimeEnv] used to run queries with this `SessionContext` @@ -678,7 +691,7 @@ impl SessionContext { match ddl { DdlStatement::CreateExternalTable(cmd) => { (Box::pin(async move { self.create_external_table(&cmd).await }) - as std::pin::Pin + Send>>) + as std::pin::Pin + Send>>) .await } DdlStatement::CreateMemoryTable(cmd) => { @@ -709,7 +722,12 @@ impl SessionContext { } // TODO what about the other statements (like TransactionStart and TransactionEnd) LogicalPlan::Statement(Statement::SetVariable(stmt)) => { - self.set_variable(stmt).await + self.set_variable(stmt).await?; + self.return_empty_dataframe() + } + LogicalPlan::Statement(Statement::ResetVariable(stmt)) => { + self.reset_variable(stmt).await?; + self.return_empty_dataframe() } LogicalPlan::Statement(Statement::Prepare(Prepare { name, @@ -727,12 +745,19 @@ impl SessionContext { ); } } - // Store the unoptimized plan into the session state. Although storing the - // optimized plan or the physical plan would be more efficient, doing so is - // not currently feasible. This is because `now()` would be optimized to a - // constant value, causing each EXECUTE to yield the same result, which is - // incorrect behavior.
- self.state.write().store_prepared(name, fields, input)?; + // Optimize the plan without evaluating expressions like now() + let optimizer_context = OptimizerContext::new_with_config_options( + Arc::clone(self.state().config().options()), + ) + .without_query_execution_start_time(); + let plan = self.state().optimizer().optimize( + Arc::unwrap_or_clone(input), + &optimizer_context, + |_1, _2| {}, + )?; + self.state + .write() + .store_prepared(name, fields, Arc::new(plan))?; self.return_empty_dataframe() } LogicalPlan::Statement(Statement::Execute(execute)) => { @@ -774,7 +799,7 @@ impl SessionContext { /// * [`SessionState::create_physical_expr`] for a lower level API /// /// [simplified]: datafusion_optimizer::simplify_expressions - /// [expr_api]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/expr_api.rs + /// [expr_api]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/query_planning/expr_api.rs pub fn create_physical_expr( &self, expr: Expr, @@ -926,13 +951,13 @@ impl SessionContext { match (or_replace, view) { (true, Ok(_)) => { self.deregister_table(name.clone())?; - let input = Self::apply_type_coercion(input.as_ref().clone())?; + let input = Self::apply_type_coercion(Arc::unwrap_or_clone(input))?; let table = Arc::new(ViewTable::new(input, definition)); self.register_table(name, table)?; self.return_empty_dataframe() } (_, Err(_)) => { - let input = Self::apply_type_coercion(input.as_ref().clone())?; + let input = Self::apply_type_coercion(Arc::unwrap_or_clone(input))?; let table = Arc::new(ViewTable::new(input, definition)); self.register_table(name, table)?; self.return_empty_dataframe() @@ -1052,22 +1077,22 @@ impl SessionContext { } else if allow_missing { return self.return_empty_dataframe(); } else { - return self.schema_doesnt_exist_err(name); + return self.schema_doesnt_exist_err(&name); } }; let dereg = catalog.deregister_schema(name.schema_name(), cascade)?; match (dereg, allow_missing) { (None, true) => self.return_empty_dataframe(), - (None, false) => self.schema_doesnt_exist_err(name), + (None, false) => self.schema_doesnt_exist_err(&name), (Some(_), _) => self.return_empty_dataframe(), } } - fn schema_doesnt_exist_err(&self, schemaref: SchemaReference) -> Result<DataFrame> { - exec_err!("Schema '{schemaref}' doesn't exist.") + fn schema_doesnt_exist_err(&self, schema_ref: &SchemaReference) -> Result<DataFrame> { + exec_err!("Schema '{schema_ref}' doesn't exist.") } - async fn set_variable(&self, stmt: SetVariable) -> Result<DataFrame> { + async fn set_variable(&self, stmt: SetVariable) -> Result<()> { let SetVariable { variable, value, ..
} = stmt; @@ -1097,11 +1122,37 @@ impl SessionContext { for udf in udfs_to_update { state.register_udf(udf)?; } + } - drop(state); + Ok(()) + } + + async fn reset_variable(&self, stmt: ResetVariable) -> Result<()> { + let variable = stmt.variable; + if variable.starts_with("datafusion.runtime.") { + return self.reset_runtime_variable(&variable); } - self.return_empty_dataframe() + let mut state = self.state.write(); + state.config_mut().options_mut().reset(&variable)?; + + // Refresh UDFs to ensure configuration-dependent behavior updates + let config_options = state.config().options(); + let udfs_to_update: Vec<_> = state + .scalar_functions() + .values() + .filter_map(|udf| { + udf.inner() + .with_updated_config(config_options) + .map(Arc::new) + }) + .collect(); + + for udf in udfs_to_update { + state.register_udf(udf)?; + } + + Ok(()) } fn set_runtime_variable(&self, variable: &str, value: &str) -> Result<()> { @@ -1112,18 +1163,65 @@ impl SessionContext { let mut builder = RuntimeEnvBuilder::from_runtime_env(state.runtime_env()); builder = match key { "memory_limit" => { - let memory_limit = Self::parse_memory_limit(value)?; + let memory_limit = Self::parse_capacity_limit(variable, value)?; builder.with_memory_limit(memory_limit, 1.0) } "max_temp_directory_size" => { - let directory_size = Self::parse_memory_limit(value)?; + let directory_size = Self::parse_capacity_limit(variable, value)?; builder.with_max_temp_directory_size(directory_size as u64) } "temp_directory" => builder.with_temp_file_path(value), "metadata_cache_limit" => { - let limit = Self::parse_memory_limit(value)?; + let limit = Self::parse_capacity_limit(variable, value)?; builder.with_metadata_cache_limit(limit) } + "list_files_cache_limit" => { + let limit = Self::parse_capacity_limit(variable, value)?; + builder.with_object_list_cache_limit(limit) + } + "list_files_cache_ttl" => { + let duration = Self::parse_duration(variable, value)?; + builder.with_object_list_cache_ttl(Some(duration)) + } + _ => return plan_err!("Unknown runtime configuration: {variable}"), + // Remember to update `reset_runtime_variable()` when adding new options + }; + + *state = SessionStateBuilder::from(state.clone()) + .with_runtime_env(Arc::new(builder.build()?)) + .build(); + + Ok(()) + } + + fn reset_runtime_variable(&self, variable: &str) -> Result<()> { + let key = variable.strip_prefix("datafusion.runtime.").unwrap(); + + let mut state = self.state.write(); + + let mut builder = RuntimeEnvBuilder::from_runtime_env(state.runtime_env()); + match key { + "memory_limit" => { + builder.memory_pool = None; + } + "max_temp_directory_size" => { + builder = + builder.with_max_temp_directory_size(DEFAULT_MAX_TEMP_DIRECTORY_SIZE); + } + "temp_directory" => { + builder.disk_manager_builder = Some(DiskManagerBuilder::default()); + } + "metadata_cache_limit" => { + builder = builder.with_metadata_cache_limit(DEFAULT_METADATA_CACHE_LIMIT); + } + "list_files_cache_limit" => { + builder = builder + .with_object_list_cache_limit(DEFAULT_LIST_FILES_CACHE_MEMORY_LIMIT); + } + "list_files_cache_ttl" => { + builder = + builder.with_object_list_cache_ttl(DEFAULT_LIST_FILES_CACHE_TTL); + } _ => return plan_err!("Unknown runtime configuration: {variable}"), }; @@ -1150,11 +1248,23 @@ impl SessionContext { /// (1.5 * 1024.0 * 1024.0 * 1024.0) as usize /// ); /// ``` + #[deprecated( + since = "53.0.0", + note = "please use `parse_capacity_limit` function instead." 
+ )] pub fn parse_memory_limit(limit: &str) -> Result<usize> { + if limit.trim().is_empty() { + return Err(plan_datafusion_err!("Empty limit value found!")); + } let (number, unit) = limit.split_at(limit.len() - 1); let number: f64 = number.parse().map_err(|_| { plan_datafusion_err!("Failed to parse number from memory limit '{limit}'") })?; + if number.is_sign_negative() || number.is_infinite() { + return Err(plan_datafusion_err!( + "Limit value should be positive finite number" + )); + } match unit { "K" => Ok((number * 1024.0) as usize), @@ -1164,6 +1274,111 @@ impl SessionContext { } } + /// Parse capacity limit from string to number of bytes by allowing units: K, M and G. + /// Supports formats like '1.5G', '100M', '512K' + /// + /// # Examples + /// ``` + /// use datafusion::execution::context::SessionContext; + /// + /// assert_eq!( + /// SessionContext::parse_capacity_limit("datafusion.runtime.memory_limit", "1M").unwrap(), + /// 1024 * 1024 + /// ); + /// assert_eq!( + /// SessionContext::parse_capacity_limit("datafusion.runtime.memory_limit", "1.5G").unwrap(), + /// (1.5 * 1024.0 * 1024.0 * 1024.0) as usize + /// ); + /// ``` + pub fn parse_capacity_limit(config_name: &str, limit: &str) -> Result<usize> { + if limit.trim().is_empty() { + return Err(plan_datafusion_err!( + "Empty limit value found for '{config_name}'" + )); + } + let (number, unit) = limit.split_at(limit.len() - 1); + let number: f64 = number.parse().map_err(|_| { + plan_datafusion_err!( + "Failed to parse number from '{config_name}', limit '{limit}'" + ) + })?; + if number.is_sign_negative() || number.is_infinite() { + return Err(plan_datafusion_err!( + "Limit value should be positive finite number for '{config_name}'" + )); + } + + match unit { + "K" => Ok((number * 1024.0) as usize), + "M" => Ok((number * 1024.0 * 1024.0) as usize), + "G" => Ok((number * 1024.0 * 1024.0 * 1024.0) as usize), + _ => plan_err!( + "Unsupported unit '{unit}' in '{config_name}', limit '{limit}'. \ Unit must be one of: 'K', 'M', 'G'" + ), + } + } + + fn parse_duration(config_name: &str, duration: &str) -> Result<Duration> { + if duration.trim().is_empty() { + return Err(plan_datafusion_err!( + "Duration should not be empty or blank for '{config_name}'" + )); + } + + let mut minutes = None; + let mut seconds = None; + + for duration in duration.split_inclusive(&['m', 's']) { + let (number, unit) = duration.split_at(duration.len() - 1); + let number: u64 = number.parse().map_err(|_| { + plan_datafusion_err!("Failed to parse number from duration '{duration}' for '{config_name}'") + })?; + + match unit { + "m" if minutes.is_none() && seconds.is_none() => minutes = Some(number), + "s" if seconds.is_none() => seconds = Some(number), + other => plan_err!( + "Invalid duration unit: '{other}'. The unit must be either 'm' (minutes), or 's' (seconds), and be in the correct order for '{config_name}'" + )?, + } + } + + let secs = Self::check_overflow(config_name, minutes, 60, seconds)?; + let duration = Duration::from_secs(secs); + + if duration.is_zero() { + return plan_err!( + "Duration must be greater than 0 seconds for '{config_name}'" + ); + } + + Ok(duration) + } + + fn check_overflow( + config_name: &str, + mins: Option<u64>, + multiplier: u64, + secs: Option<u64>, + ) -> Result<u64> { + let first_part_of_secs = mins.unwrap_or_default().checked_mul(multiplier); + if first_part_of_secs.is_none() { + plan_err!( + "Duration has overflowed allowed maximum limit due to 'mins * {multiplier}' when setting '{config_name}'" + )?
+ } + let second_part_of_secs = first_part_of_secs + .unwrap() + .checked_add(secs.unwrap_or_default()); + if second_part_of_secs.is_none() { + plan_err!( + "Duration has overflowed allowed maximum limit due to 'mins * {multiplier} + secs' when setting '{config_name}'" + )? + } + Ok(second_part_of_secs.unwrap()) + } + async fn create_custom_table( &self, cmd: &CreateExternalTable, @@ -1190,20 +1405,24 @@ impl SessionContext { let table = table_ref.table().to_owned(); let maybe_schema = { let state = self.state.read(); - let resolved = state.resolve_table_ref(table_ref); + let resolved = state.resolve_table_ref(table_ref.clone()); state .catalog_list() .catalog(&resolved.catalog) .and_then(|c| c.schema(&resolved.schema)) }; - if let Some(schema) = maybe_schema { - if let Some(table_provider) = schema.table(&table).await? { - if table_provider.table_type() == table_type { - schema.deregister_table(&table)?; - return Ok(true); - } + if let Some(schema) = maybe_schema + && let Some(table_provider) = schema.table(&table).await? + && table_provider.table_type() == table_type + { + schema.deregister_table(&table)?; + if table_type == TableType::Base + && let Some(lfc) = self.runtime_env().cache_manager.get_list_files_cache() + { + lfc.drop_table_entries(&Some(table_ref))?; } + return Ok(true); } Ok(false) @@ -1219,7 +1438,7 @@ impl SessionContext { _ => { return Err(DataFusionError::Configuration( "Function factory has not been configured".to_string(), - )) + )); } } }; @@ -1269,14 +1488,24 @@ impl SessionContext { exec_datafusion_err!("Prepared statement '{}' does not exist", name) })?; + let state = self.state.read(); + let context = SimplifyContext::builder() + .with_schema(Arc::clone(prepared.plan.schema())) + .with_config_options(Arc::clone(state.config_options())) + .with_query_execution_start_time( + state.execution_props().query_execution_start_time, + ) + .build(); + let simplifier = ExprSimplifier::new(context); + // Only allow literals as parameters for now. let mut params: Vec<ScalarAndMetadata> = parameters .into_iter() - .map(|e| match e { + .map(|e| match simplifier.simplify(e)? { Expr::Literal(scalar, metadata) => { Ok(ScalarAndMetadata::new(scalar, metadata)) } - _ => not_impl_err!("Unsupported parameter type: {}", e), + e => not_impl_err!("Unsupported parameter type: {e}"), }) .collect::<Result<Vec<_>>>()?; @@ -1359,6 +1588,18 @@ impl SessionContext { self.state.write().register_udwf(Arc::new(f)).ok(); } + #[cfg(feature = "sql")] + /// Registers a [`RelationPlanner`] to customize SQL table-factor planning. + /// + /// Planners are invoked in reverse registration order, allowing newer + /// planners to take precedence over existing ones. + pub fn register_relation_planner( + &self, + planner: Arc<dyn RelationPlanner>, + ) -> Result<()> { + self.state.write().register_relation_planner(planner) + } + /// Deregisters a UDF within this context. pub fn deregister_udf(&self, name: &str) { self.state.write().deregister_udf(name).ok(); } @@ -1544,15 +1785,14 @@ impl SessionContext { /// SQL statements executed against this context.
pub async fn register_arrow( &self, - name: &str, - table_path: &str, + table_ref: impl Into<TableReference>, + table_path: impl AsRef<str>, options: ArrowReadOptions<'_>, ) -> Result<()> { let listing_options = options .to_listing_options(&self.copied_config(), self.copied_table_options()); - self.register_listing_table( - name, + table_ref, table_path, listing_options, options.schema.map(|s| Arc::new(s.to_owned())), @@ -1738,6 +1978,10 @@ impl FunctionRegistry for SessionContext { self.state.read().udf(name) } + fn higher_order_function(&self, name: &str) -> Result<Arc<HigherOrderUDF>> { + self.state.read().higher_order_function(name) + } + fn udaf(&self, name: &str) -> Result<Arc<AggregateUDF>> { self.state.read().udaf(name) } @@ -1750,6 +1994,13 @@ self.state.write().register_udf(udf) } + fn register_higher_order_function( + &mut self, + function: Arc<HigherOrderUDF>, + ) -> Result<Option<Arc<HigherOrderUDF>>> { + self.state.write().register_higher_order_function(function) + } + fn register_udaf( &mut self, udaf: Arc<AggregateUDF>, @@ -1779,6 +2030,10 @@ self.state.write().register_expr_planner(expr_planner) } + fn higher_order_function_names(&self) -> HashSet<String> { + self.state.read().higher_order_function_names() + } + fn udafs(&self) -> HashSet<String> { self.state.read().udafs() } @@ -1788,6 +2043,12 @@ } } +impl datafusion_execution::TaskContextProvider for SessionContext { + fn task_ctx(&self) -> Arc<TaskContext> { + SessionContext::task_ctx(self) + } +} /// Create a new task context instance from SessionContext impl From<&SessionContext> for TaskContext { fn from(session: &SessionContext) -> Self { @@ -1831,7 +2092,7 @@ pub trait QueryPlanner: Debug { /// because the implementation and requirements vary widely. Please see /// [function_factory example] for a reference implementation.
/// -/// [function_factory example]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/function_factory.rs +/// [function_factory example]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/function_factory.rs /// /// # Examples of syntax that can be supported /// @@ -1998,7 +2259,9 @@ mod tests { use crate::test; use crate::test_util::{plan_and_collect, populate_csv_partitions}; use arrow::datatypes::{DataType, TimeUnit}; + use arrow_schema::FieldRef; use datafusion_common::DataFusionError; + use datafusion_common::datatype::DataTypeExt; use std::error::Error; use std::path::PathBuf; @@ -2023,7 +2286,7 @@ mod tests { // configure with same memory / disk manager let memory_pool = ctx1.runtime_env().memory_pool.clone(); - let mut reservation = MemoryConsumer::new("test").register(&memory_pool); + let reservation = MemoryConsumer::new("test").register(&memory_pool); reservation.grow(100); let disk_manager = ctx1.runtime_env().disk_manager.clone(); @@ -2515,7 +2778,7 @@ struct MyTypePlanner {} impl TypePlanner for MyTypePlanner { - fn plan_type(&self, sql_type: &ast::DataType) -> Result<Option<DataType>> { + fn plan_type_field(&self, sql_type: &ast::DataType) -> Result<Option<FieldRef>> { match sql_type { ast::DataType::Datetime(precision) => { let precision = match precision { @@ -2525,10 +2788,213 @@ mod tests { None | Some(9) => TimeUnit::Nanosecond, _ => unreachable!(), }; - Ok(Some(DataType::Timestamp(precision, None))) + Ok(Some( + DataType::Timestamp(precision, None).into_nullable_field_ref(), + )) } _ => Ok(None), } } } + + #[tokio::test] + async fn remove_optimizer_rule() -> Result<()> { + let get_optimizer_rules = |ctx: &SessionContext| { + ctx.state() + .optimizer() + .rules + .iter() + .map(|r| r.name().to_owned()) + .collect::<HashSet<_>>() + }; + + let ctx = SessionContext::new(); + assert!(get_optimizer_rules(&ctx).contains("simplify_expressions")); + + // default plan + let plan = ctx + .sql("select 1 + 1") + .await? + .into_optimized_plan()? + .to_string(); + assert_snapshot!(plan, @r" + Projection: Int64(2) AS Int64(1) + Int64(1) + EmptyRelation: rows=1 + "); + + assert!(ctx.remove_optimizer_rule("simplify_expressions")); + assert!(!get_optimizer_rules(&ctx).contains("simplify_expressions")); + + // plan without the simplify_expressions rule + let plan = ctx + .sql("select 1 + 1") + .await? + .into_optimized_plan()?
+ .to_string(); + assert_snapshot!(plan, @r" + Projection: Int64(1) + Int64(1) + EmptyRelation: rows=1 + "); + + // attempting to remove a non-existing rule returns false + assert!(!ctx.remove_optimizer_rule("simplify_expressions")); + + Ok(()) + } + + #[test] + fn test_parse_duration() { + const LIST_FILES_CACHE_TTL: &str = "datafusion.runtime.list_files_cache_ttl"; + + // Valid durations + for (duration, want) in [ + ("1s", Duration::from_secs(1)), + ("1m", Duration::from_secs(60)), + ("1m0s", Duration::from_secs(60)), + ("1m1s", Duration::from_secs(61)), + ] { + let have = + SessionContext::parse_duration(LIST_FILES_CACHE_TTL, duration).unwrap(); + assert_eq!(want, have); + } + + // Invalid durations + for duration in [ + "0s", "0m", "1s0m", "1s1m", "XYZ", "1h", "XYZm2s", "", " ", "-1m", "1m 1s", + "1m1s ", " 1m1s", + ] { + let have = SessionContext::parse_duration(LIST_FILES_CACHE_TTL, duration); + assert!(have.is_err()); + assert!( + have.unwrap_err() + .message() + .to_string() + .contains(LIST_FILES_CACHE_TTL) + ); + } + } + + #[test] + fn test_parse_duration_with_overflow_check() { + const LIST_FILES_CACHE_TTL: &str = "datafusion.runtime.list_files_cache_ttl"; + + // Valid durations which are close to max allowed limit + for (duration, want) in [ + ( + "18446744073709551615s", + Duration::from_secs(18446744073709551615), + ), + ( + "307445734561825860m", + Duration::from_secs(307445734561825860 * 60), + ), + ( + "307445734561825860m10s", + Duration::from_secs(307445734561825860 * 60 + 10), + ), + ( + "1m18446744073709551555s", + Duration::from_secs(60 + 18446744073709551555), + ), + ] { + let have = + SessionContext::parse_duration(LIST_FILES_CACHE_TTL, duration).unwrap(); + assert_eq!(want, have); + } + + // Invalid durations which overflow max allowed limit + for (duration, error_message_prefix) in [ + ( + "18446744073709551616s", + "Failed to parse number from duration", + ), + ( + "307445734561825861m", + "Duration has overflowed allowed maximum limit due to", + ), + ( + "307445734561825860m60s", + "Duration has overflowed allowed maximum limit due to", + ), + ( + "1m18446744073709551556s", + "Duration has overflowed allowed maximum limit due to", + ), + ] { + let have = SessionContext::parse_duration(LIST_FILES_CACHE_TTL, duration); + assert!(have.is_err()); + let error_message = have.unwrap_err().message().to_string(); + assert!( + error_message.contains(error_message_prefix) + && error_message.contains(LIST_FILES_CACHE_TTL) + ); + } + } + + #[test] + fn test_parse_memory_limit() { + // Valid memory_limit + for (limit, want) in [ + ("1.5K", (1.5 * 1024.0) as usize), + ("2M", (2f64 * 1024.0 * 1024.0) as usize), + ("1G", (1f64 * 1024.0 * 1024.0 * 1024.0) as usize), + ] { + #[expect(deprecated)] + let have = SessionContext::parse_memory_limit(limit).unwrap(); + assert_eq!(want, have); + } + + // Invalid memory_limit + for limit in [ + "1B", + "1T", + "", + " ", + "XYZG", + "-1G", + "infG", + "-infG", + "G", + "1024B", + "invalid_size", + ] { + #[expect(deprecated)] + let have = SessionContext::parse_memory_limit(limit); + assert!(have.is_err()); + } + } + + #[test] + fn test_parse_capacity_limit() { + const MEMORY_LIMIT: &str = "datafusion.runtime.memory_limit"; + + // Valid capacity_limit + for (limit, want) in [ + ("1.5K", (1.5 * 1024.0) as usize), + ("2M", (2f64 * 1024.0 * 1024.0) as usize), + ("1G", (1f64 * 1024.0 * 1024.0 * 1024.0) as usize), + ] { + let have = SessionContext::parse_capacity_limit(MEMORY_LIMIT, limit).unwrap(); + assert_eq!(want, have); + } + + // Invalid 
capacity_limit + for limit in [ + "1B", + "1T", + "", + " ", + "XYZG", + "-1G", + "infG", + "-infG", + "G", + "1024B", + "invalid_size", + ] { + let have = SessionContext::parse_capacity_limit(MEMORY_LIMIT, limit); + assert!(have.is_err()); + assert!(have.unwrap_err().to_string().contains(MEMORY_LIMIT)); + } + } } diff --git a/datafusion/core/src/execution/context/parquet.rs b/datafusion/core/src/execution/context/parquet.rs index 731f7e59ecfaf..823dc946ea732 100644 --- a/datafusion/core/src/execution/context/parquet.rs +++ b/datafusion/core/src/execution/context/parquet.rs @@ -113,7 +113,7 @@ mod tests { }; use datafusion_execution::config::SessionConfig; - use tempfile::{tempdir, TempDir}; + use tempfile::{TempDir, tempdir}; #[tokio::test] async fn read_with_glob_path() -> Result<()> { @@ -355,7 +355,9 @@ mod tests { let expected_path = binding[0].as_str(); assert_eq!( read_df.unwrap_err().strip_backtrace(), - format!("Execution error: File path '{expected_path}' does not match the expected extension '.parquet'") + format!( + "Execution error: File path '{expected_path}' does not match the expected extension '.parquet'" + ) ); // Read the dataframe from 'output3.parquet.snappy.parquet' with the correct file extension. diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index c15b7eae08432..de5e6b97c1af9 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -27,14 +27,14 @@ use crate::catalog::{CatalogProviderList, SchemaProvider, TableProviderFactory}; use crate::datasource::file_format::FileFormatFactory; #[cfg(feature = "sql")] use crate::datasource::provider_as_source; -use crate::execution::context::{EmptySerializerRegistry, FunctionFactory, QueryPlanner}; use crate::execution::SessionStateDefaults; +use crate::execution::context::{EmptySerializerRegistry, FunctionFactory, QueryPlanner}; use crate::physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner}; use arrow_schema::{DataType, FieldRef}; +use datafusion_catalog::MemoryCatalogProviderList; use datafusion_catalog::information_schema::{ - InformationSchemaProvider, INFORMATION_SCHEMA, + INFORMATION_SCHEMA, InformationSchemaProvider, }; -use datafusion_catalog::MemoryCatalogProviderList; use datafusion_catalog::{TableFunction, TableFunctionImpl}; use datafusion_common::alias::AliasGenerator; #[cfg(feature = "sql")] @@ -43,23 +43,26 @@ use datafusion_common::config::{ConfigExtension, ConfigOptions, TableOptions}; use datafusion_common::display::{PlanType, StringifiedPlan, ToStringifiedPlan}; use datafusion_common::tree_node::TreeNode; use datafusion_common::{ - config_err, exec_err, plan_datafusion_err, DFSchema, DataFusionError, - ResolvedTableReference, TableReference, + DFSchema, DataFusionError, ResolvedTableReference, TableReference, config_err, + exec_err, plan_datafusion_err, }; +use datafusion_execution::TaskContext; use datafusion_execution::config::SessionConfig; use datafusion_execution::runtime_env::RuntimeEnv; -use datafusion_execution::TaskContext; +#[cfg(feature = "sql")] +use datafusion_expr::TableSource; use datafusion_expr::execution_props::ExecutionProps; use datafusion_expr::expr_rewriter::FunctionRewrite; use datafusion_expr::planner::ExprPlanner; #[cfg(feature = "sql")] -use datafusion_expr::planner::TypePlanner; -use datafusion_expr::registry::{FunctionRegistry, SerializerRegistry}; -use datafusion_expr::simplify::SimplifyInfo; -#[cfg(feature = "sql")] -use datafusion_expr::TableSource; 
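// Migration sketch: the removed `SimplifyInfo`-based plumbing (including the
// private `SessionSimplifyProvider` deleted at the end of this file) is
// replaced by the builder-based `SimplifyContext` imported below. The pattern,
// mirroring `SessionState::create_physical_expr` later in this diff
// (`query_start` stands in for `execution_props().query_execution_start_time`):
//
//     let ctx = SimplifyContext::builder()
//         .with_schema(Arc::new(df_schema.clone()))
//         .with_config_options(Arc::clone(config_options))
//         .with_query_execution_start_time(query_start)
//         .build();
//     let simplifier = ExprSimplifier::new(ctx);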
+use datafusion_expr::planner::{RelationPlanner, TypePlanner}; +use datafusion_expr::registry::{ + ExtensionTypeRegistryRef, FunctionRegistry, MemoryExtensionTypeRegistry, + SerializerRegistry, +}; +use datafusion_expr::simplify::SimplifyContext; use datafusion_expr::{ - AggregateUDF, Explain, Expr, ExprSchemable, LogicalPlan, ScalarUDF, WindowUDF, + AggregateUDF, Explain, Expr, HigherOrderUDF, LogicalPlan, ScalarUDF, WindowUDF, }; use datafusion_optimizer::simplify_expressions::ExprSimplifier; use datafusion_optimizer::{ @@ -67,9 +70,11 @@ use datafusion_optimizer::{ }; use datafusion_physical_expr::create_physical_expr; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; -use datafusion_physical_optimizer::optimizer::PhysicalOptimizer; +use datafusion_physical_optimizer::PhysicalOptimizerContext; use datafusion_physical_optimizer::PhysicalOptimizerRule; +use datafusion_physical_optimizer::optimizer::PhysicalOptimizer; use datafusion_physical_plan::ExecutionPlan; +use datafusion_physical_plan::operator_statistics::StatisticsRegistry; use datafusion_session::Session; #[cfg(feature = "sql")] use datafusion_sql::{ @@ -139,6 +144,8 @@ pub struct SessionState { analyzer: Analyzer, /// Provides support for customizing the SQL planner, e.g. to add support for custom operators like `->>` or `?` expr_planners: Vec>, + #[cfg(feature = "sql")] + relation_planners: Vec>, /// Provides support for customizing the SQL type planning #[cfg(feature = "sql")] type_planner: Option>, @@ -154,10 +161,14 @@ pub struct SessionState { table_functions: HashMap>, /// Scalar functions that are registered with the context scalar_functions: HashMap>, + /// Higher order functions that are registered with the context + higher_order_functions: HashMap>, /// Aggregate functions registered in the context aggregate_functions: HashMap>, /// Window functions registered in the context window_functions: HashMap>, + /// Extension types registry for extensions. + extension_types: ExtensionTypeRegistryRef, /// Deserializer registry for extensions. serializer_registry: Arc, /// Holds registered external FileFormat implementations @@ -185,11 +196,28 @@ pub struct SessionState { /// It will be invoked on `CREATE FUNCTION` statements. /// thus, changing dialect o PostgreSql is required function_factory: Option>, + cache_factory: Option>, + /// Optional statistics registry for pluggable statistics providers. + /// + /// When set, physical optimizer rules can use this registry to obtain + /// enhanced statistics (e.g., NDV overrides, histograms) beyond what + /// is available from `ExecutionPlan::partition_statistics()`. + statistics_registry: Option, /// Cache logical plans of prepared statements for later execution. /// Key is the prepared statement name. 
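// Usage sketch for the new `statistics_registry` field, with a hypothetical
// `my_registry: StatisticsRegistry`: it is set via the builder method added
// further down in this diff and surfaced to physical optimizer rules through
// the `PhysicalOptimizerContext` impl below:
//
//     let state = SessionStateBuilder::new()
//         .with_default_features()
//         .with_statistics_registry(my_registry)
//         .build();
//     assert!(state.statistics_registry().is_some());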
prepared_plans: HashMap>, } +impl PhysicalOptimizerContext for SessionState { + fn config_options(&self) -> &ConfigOptions { + self.config_options() + } + + fn statistics_registry(&self) -> Option<&StatisticsRegistry> { + self.statistics_registry.as_ref() + } +} + impl Debug for SessionState { /// Prefer having short fields at the top and long vector fields near the end /// Group fields by @@ -206,8 +234,12 @@ impl Debug for SessionState { .field("table_options", &self.table_options) .field("table_factories", &self.table_factories) .field("function_factory", &self.function_factory) + .field("cache_factory", &self.cache_factory) .field("expr_planners", &self.expr_planners); + #[cfg(feature = "sql")] + let ret = ret.field("relation_planners", &self.relation_planners); + #[cfg(feature = "sql")] let ret = ret.field("type_planner", &self.type_planner); @@ -217,6 +249,7 @@ impl Debug for SessionState { .field("physical_optimizers", &self.physical_optimizers) .field("table_functions", &self.table_functions) .field("scalar_functions", &self.scalar_functions) + .field("higher_order_functions", &self.higher_order_functions) .field("aggregate_functions", &self.aggregate_functions) .field("window_functions", &self.window_functions) .field("prepared_plans", &self.prepared_plans) @@ -253,6 +286,10 @@ impl Session for SessionState { &self.scalar_functions } + fn higher_order_functions(&self) -> &HashMap> { + &self.higher_order_functions + } + fn aggregate_functions(&self) -> &HashMap> { &self.aggregate_functions } @@ -261,6 +298,10 @@ impl Session for SessionState { &self.window_functions } + fn extension_type_registry(&self) -> &ExtensionTypeRegistryRef { + &self.extension_types + } + fn runtime_env(&self) -> &Arc { self.runtime_env() } @@ -345,6 +386,13 @@ impl SessionState { self.optimizer.rules.push(optimizer_rule); } + /// Removes an optimizer rule by name, returning `true` if it existed. + pub(crate) fn remove_optimizer_rule(&mut self, name: &str) -> bool { + let original_len = self.optimizer.rules.len(); + self.optimizer.rules.retain(|r| r.name() != name); + self.optimizer.rules.len() < original_len + } + /// Registers a [`FunctionFactory`] to handle `CREATE FUNCTION` statements pub fn set_function_factory(&mut self, function_factory: Arc) { self.function_factory = Some(function_factory); @@ -355,6 +403,16 @@ impl SessionState { self.function_factory.as_ref() } + /// Register a [`CacheFactory`] for custom caching strategy + pub fn set_cache_factory(&mut self, cache_factory: Arc) { + self.cache_factory = Some(cache_factory); + } + + /// Get the cache factory + pub fn cache_factory(&self) -> Option<&Arc> { + self.cache_factory.as_ref() + } + /// Get the table factories pub fn table_factories(&self) -> &HashMap> { &self.table_factories @@ -480,10 +538,10 @@ impl SessionState { let resolved = self.resolve_table_ref(reference); if let Entry::Vacant(v) = provider.tables.entry(resolved) { let resolved = v.key(); - if let Ok(schema) = self.schema_for_ref(resolved.clone()) { - if let Some(table) = schema.table(&resolved.table).await? { - v.insert(provider_as_source(table)); - } + if let Ok(schema) = self.schema_for_ref(resolved.clone()) + && let Some(table) = schema.table(&resolved.table).await? + { + v.insert(provider_as_source(table)); } } } @@ -547,6 +605,16 @@ impl SessionState { let sql_expr = self.sql_to_expr_with_alias(sql, &dialect)?; + self.create_logical_expr_from_sql_expr(sql_expr, df_schema) + } + + /// Creates a datafusion style AST [`Expr`] from a SQL expression. 
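+ ///
+ /// Sketch of the intended call pattern, following the
+ /// `test_create_logical_expr_from_sql_expr` test added at the bottom of
+ /// this file:
+ ///
+ /// ```rust,ignore
+ /// let sql_expr = state.sql_to_expr_with_alias("a > 10", &dialect)?;
+ /// let expr = state.create_logical_expr_from_sql_expr(sql_expr, &df_schema)?;
+ /// ```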
+ #[cfg(feature = "sql")] + pub fn create_logical_expr_from_sql_expr( + &self, + sql_expr: SQLExprWithAlias, + df_schema: &DFSchema, + ) -> datafusion_common::Result { let provider = SessionContextProvider { state: self, tables: HashMap::new(), @@ -571,6 +639,24 @@ impl SessionState { &self.expr_planners } + #[cfg(feature = "sql")] + /// Returns the registered relation planners in priority order. + pub fn relation_planners(&self) -> &[Arc] { + &self.relation_planners + } + + #[cfg(feature = "sql")] + /// Registers a [`RelationPlanner`] to customize SQL relation planning. + /// + /// Newly registered planners are given higher priority than existing ones. + pub fn register_relation_planner( + &mut self, + planner: Arc, + ) -> datafusion_common::Result<()> { + self.relation_planners.insert(0, planner); + Ok(()) + } + /// Returns the [`QueryPlanner`] for this session pub fn query_planner(&self) -> &Arc { &self.query_planner @@ -685,20 +771,26 @@ impl SessionState { /// * [`create_physical_expr`] for a lower-level API /// /// [simplified]: datafusion_optimizer::simplify_expressions - /// [expr_api]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/expr_api.rs + /// [expr_api]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/query_planning/expr_api.rs /// [`SessionContext::create_physical_expr`]: crate::execution::context::SessionContext::create_physical_expr pub fn create_physical_expr( &self, expr: Expr, df_schema: &DFSchema, ) -> datafusion_common::Result> { - let simplifier = - ExprSimplifier::new(SessionSimplifyProvider::new(self, df_schema)); + let config_options = self.config_options(); + let simplify_context = SimplifyContext::builder() + .with_schema(Arc::new(df_schema.clone())) + .with_config_options(Arc::clone(config_options)) + .with_query_execution_start_time( + self.execution_props().query_execution_start_time, + ) + .build(); + let simplifier = ExprSimplifier::new(simplify_context); // apply type coercion here to ensure types match let mut expr = simplifier.coerce(expr, df_schema)?; // rewrite Exprs to functions if necessary - let config_options = self.config_options(); for rewrite in self.analyzer.function_rewrites() { expr = expr .transform_up(|expr| rewrite.rewrite(expr, df_schema, config_options))? @@ -752,6 +844,14 @@ impl SessionState { self.config.options() } + /// Returns the statistics registry if one is configured. + /// + /// The registry provides pluggable statistics providers for enhanced + /// cardinality estimation (e.g., NDV overrides, histograms). + pub fn statistics_registry(&self) -> Option<&StatisticsRegistry> { + self.statistics_registry.as_ref() + } + /// Mark the start of the execution pub fn mark_start_execution(&mut self) { let config = Arc::clone(self.config.options()); @@ -788,10 +888,18 @@ impl SessionState { overwrite: bool, ) -> Result<(), DataFusionError> { let ext = file_format.get_ext().to_lowercase(); - match (self.file_formats.entry(ext.clone()), overwrite){ - (Entry::Vacant(e), _) => {e.insert(file_format);}, - (Entry::Occupied(mut e), true) => {e.insert(file_format);}, - (Entry::Occupied(_), false) => return config_err!("File type already registered for extension {ext}. 
Set overwrite to true to replace this extension."), + match (self.file_formats.entry(ext.clone()), overwrite) { + (Entry::Vacant(e), _) => { + e.insert(file_format); + } + (Entry::Occupied(mut e), true) => { + e.insert(file_format); + } + (Entry::Occupied(_), false) => { + return config_err!( + "File type already registered for extension {ext}. Set overwrite to true to replace this extension." + ); + } }; Ok(()) } @@ -815,11 +923,8 @@ impl SessionState { &self.catalog_list } - /// set the catalog list - pub(crate) fn register_catalog_list( - &mut self, - catalog_list: Arc, - ) { + /// Set the catalog list + pub fn register_catalog_list(&mut self, catalog_list: Arc) { self.catalog_list = catalog_list; } @@ -828,6 +933,11 @@ impl SessionState { &self.scalar_functions } + /// Return reference to higher_order_functions + pub fn higher_order_functions(&self) -> &HashMap> { + &self.higher_order_functions + } + /// Return reference to aggregate_functions pub fn aggregate_functions(&self) -> &HashMap> { &self.aggregate_functions @@ -909,11 +1019,14 @@ impl SessionState { /// be used for all values unless explicitly provided. /// /// See example on [`SessionState`] +#[derive(Clone)] pub struct SessionStateBuilder { session_id: Option, analyzer: Option, expr_planners: Option>>, #[cfg(feature = "sql")] + relation_planners: Option>>, + #[cfg(feature = "sql")] type_planner: Option>, optimizer: Option, physical_optimizers: Option, @@ -921,8 +1034,10 @@ pub struct SessionStateBuilder { catalog_list: Option>, table_functions: Option>>, scalar_functions: Option>>, + higher_order_functions: Option>>, aggregate_functions: Option>>, window_functions: Option>>, + extension_types: Option, serializer_registry: Option>, file_formats: Option>>, config: Option, @@ -931,6 +1046,8 @@ pub struct SessionStateBuilder { table_factories: Option>>, runtime_env: Option>, function_factory: Option>, + cache_factory: Option>, + statistics_registry: Option, // fields to support convenience functions analyzer_rules: Option>>, optimizer_rules: Option>>, @@ -951,6 +1068,8 @@ impl SessionStateBuilder { analyzer: None, expr_planners: None, #[cfg(feature = "sql")] + relation_planners: None, + #[cfg(feature = "sql")] type_planner: None, optimizer: None, physical_optimizers: None, @@ -958,8 +1077,10 @@ impl SessionStateBuilder { catalog_list: None, table_functions: None, scalar_functions: None, + higher_order_functions: None, aggregate_functions: None, window_functions: None, + extension_types: None, serializer_registry: None, file_formats: None, table_options: None, @@ -968,6 +1089,8 @@ impl SessionStateBuilder { table_factories: None, runtime_env: None, function_factory: None, + cache_factory: None, + statistics_registry: None, // fields to support convenience functions analyzer_rules: None, optimizer_rules: None, @@ -1001,6 +1124,8 @@ impl SessionStateBuilder { analyzer: Some(existing.analyzer), expr_planners: Some(existing.expr_planners), #[cfg(feature = "sql")] + relation_planners: Some(existing.relation_planners), + #[cfg(feature = "sql")] type_planner: existing.type_planner, optimizer: Some(existing.optimizer), physical_optimizers: Some(existing.physical_optimizers), @@ -1008,10 +1133,14 @@ impl SessionStateBuilder { catalog_list: Some(existing.catalog_list), table_functions: Some(existing.table_functions), scalar_functions: Some(existing.scalar_functions.into_values().collect_vec()), + higher_order_functions: Some( + existing.higher_order_functions.into_values().collect_vec(), + ), aggregate_functions: Some( 
existing.aggregate_functions.into_values().collect_vec(), ), window_functions: Some(existing.window_functions.into_values().collect_vec()), + extension_types: Some(existing.extension_types), serializer_registry: Some(existing.serializer_registry), file_formats: Some(existing.file_formats.into_values().collect_vec()), config: Some(new_config), @@ -1020,7 +1149,8 @@ impl SessionStateBuilder { table_factories: Some(existing.table_factories), runtime_env: Some(existing.runtime_env), function_factory: existing.function_factory, - + cache_factory: existing.cache_factory, + statistics_registry: existing.statistics_registry, // fields to support convenience functions analyzer_rules: None, optimizer_rules: None, @@ -1049,6 +1179,10 @@ impl SessionStateBuilder { .get_or_insert_with(Vec::new) .extend(SessionStateDefaults::default_scalar_functions()); + self.higher_order_functions + .get_or_insert_with(Vec::new) + .extend(SessionStateDefaults::default_higher_order_functions()); + self.aggregate_functions .get_or_insert_with(Vec::new) .extend(SessionStateDefaults::default_aggregate_functions()); @@ -1057,6 +1191,11 @@ impl SessionStateBuilder { .get_or_insert_with(Vec::new) .extend(SessionStateDefaults::default_window_functions()); + self.extension_types + .get_or_insert_with(|| Arc::new(MemoryExtensionTypeRegistry::new_empty())) + .extend(&SessionStateDefaults::default_extension_types()) + .expect("MemoryExtensionTypeRegistry is not read-only."); + self.table_functions .get_or_insert_with(HashMap::new) .extend( @@ -1141,6 +1280,16 @@ impl SessionStateBuilder { self } + #[cfg(feature = "sql")] + /// Sets the [`RelationPlanner`]s used to customize SQL relation planning. + pub fn with_relation_planners( + mut self, + relation_planners: Vec>, + ) -> Self { + self.relation_planners = Some(relation_planners); + self + } + /// Set the [`TypePlanner`] used to customize the behavior of the SQL planner. #[cfg(feature = "sql")] pub fn with_type_planner(mut self, type_planner: Arc) -> Self { @@ -1219,6 +1368,15 @@ impl SessionStateBuilder { self } + /// Set the map of [`HigherOrderUDF`]s + pub fn with_higher_order_functions( + mut self, + higher_order_functions: Vec>, + ) -> Self { + self.higher_order_functions = Some(higher_order_functions); + self + } + /// Set the map of [`AggregateUDF`]s pub fn with_aggregate_functions( mut self, @@ -1237,6 +1395,15 @@ impl SessionStateBuilder { self } + /// Sets the [`ExtensionTypeRegistry`](datafusion_expr::registry::ExtensionTypeRegistry). + pub fn with_extension_type_registry( + mut self, + registry: ExtensionTypeRegistryRef, + ) -> Self { + self.extension_types = Some(registry); + self + } + /// Set the [`SerializerRegistry`] pub fn with_serializer_registry( mut self, @@ -1309,6 +1476,25 @@ impl SessionStateBuilder { self } + /// Set a [`CacheFactory`] for custom caching strategy + pub fn with_cache_factory( + mut self, + cache_factory: Option>, + ) -> Self { + self.cache_factory = cache_factory; + self + } + + /// Set a [`StatisticsRegistry`] for pluggable statistics providers. + /// + /// The registry allows physical optimizer rules to access enhanced statistics + /// (e.g., NDV overrides, histograms) beyond what is available from + /// `ExecutionPlan::partition_statistics()`. + pub fn with_statistics_registry(mut self, registry: StatisticsRegistry) -> Self { + self.statistics_registry = Some(registry); + self + } + /// Register an `ObjectStore` to the [`RuntimeEnv`]. See [`RuntimeEnv::register_object_store`] /// for more details. 
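// Combined builder sketch for the new extension points introduced above
// (`MyRelationPlanner` and `my_hofs: Vec<Arc<HigherOrderUDF>>` are
// hypothetical placeholders):
//
//     let state = SessionStateBuilder::new()
//         .with_default_features()
//         .with_relation_planners(vec![Arc::new(MyRelationPlanner)])
//         .with_higher_order_functions(my_hofs)
//         .build();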
/// @@ -1355,6 +1541,8 @@ impl SessionStateBuilder { analyzer, expr_planners, #[cfg(feature = "sql")] + relation_planners, + #[cfg(feature = "sql")] type_planner, optimizer, physical_optimizers, @@ -1362,8 +1550,10 @@ impl SessionStateBuilder { catalog_list, table_functions, scalar_functions, + higher_order_functions, aggregate_functions, window_functions, + extension_types, serializer_registry, file_formats, table_options, @@ -1372,6 +1562,8 @@ impl SessionStateBuilder { table_factories, runtime_env, function_factory, + cache_factory, + statistics_registry, analyzer_rules, optimizer_rules, physical_optimizer_rules, @@ -1385,6 +1577,8 @@ impl SessionStateBuilder { analyzer: analyzer.unwrap_or_default(), expr_planners: expr_planners.unwrap_or_default(), #[cfg(feature = "sql")] + relation_planners: relation_planners.unwrap_or_default(), + #[cfg(feature = "sql")] type_planner, optimizer: optimizer.unwrap_or_default(), physical_optimizers: physical_optimizers.unwrap_or_default(), @@ -1395,8 +1589,10 @@ impl SessionStateBuilder { }), table_functions: table_functions.unwrap_or_default(), scalar_functions: HashMap::new(), + higher_order_functions: HashMap::new(), aggregate_functions: HashMap::new(), window_functions: HashMap::new(), + extension_types: Arc::new(MemoryExtensionTypeRegistry::default()), serializer_registry: serializer_registry .unwrap_or_else(|| Arc::new(EmptySerializerRegistry)), file_formats: HashMap::new(), @@ -1408,6 +1604,8 @@ impl SessionStateBuilder { table_factories: table_factories.unwrap_or_default(), runtime_env, function_factory, + cache_factory, + statistics_registry, prepared_plans: HashMap::new(), }; @@ -1447,6 +1645,29 @@ impl SessionStateBuilder { } } + if let Some(higher_order_functions) = higher_order_functions { + for function in higher_order_functions { + match state.register_higher_order_function(Arc::clone(&function)) { + Ok(Some(existing)) => { + debug!( + "Overwrote existing higher-order function '{}'", + existing.name() + ); + } + Ok(None) => { + debug!("Registered higher-order function '{}'", function.name()); + } + Err(err) => { + debug!( + "Failed to register higher-order function '{}': {}", + function.name(), + err + ); + } + } + } + } + if let Some(aggregate_functions) = aggregate_functions { aggregate_functions.into_iter().for_each(|udaf| { let existing_udf = state.register_udaf(udaf); @@ -1465,6 +1686,10 @@ impl SessionStateBuilder { }); } + if let Some(extension_types) = extension_types { + state.extension_types = extension_types; + } + if state.config.create_default_catalog_and_schema() { let default_catalog = SessionStateDefaults::default_catalog( &state.config, @@ -1521,6 +1746,12 @@ impl SessionStateBuilder { &mut self.expr_planners } + #[cfg(feature = "sql")] + /// Returns a mutable reference to the current [`RelationPlanner`] list. 
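+ ///
+ /// Note on priority: `SessionState::register_relation_planner` (earlier in
+ /// this diff) inserts at the front of this list, so the most recently
+ /// registered planner is consulted first. A sketch with a hypothetical
+ /// `MyRelationPlanner`:
+ ///
+ /// ```rust,ignore
+ /// // the newest planner is consulted first
+ /// state.register_relation_planner(Arc::new(MyRelationPlanner))?;
+ /// ```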
+ pub fn relation_planners(&mut self) -> &mut Option>> { + &mut self.relation_planners + } + /// Returns the current type_planner value #[cfg(feature = "sql")] pub fn type_planner(&mut self) -> &mut Option> { @@ -1559,6 +1790,13 @@ impl SessionStateBuilder { &mut self.scalar_functions } + /// Returns the current scalar_functions value + pub fn higher_order_functions( + &mut self, + ) -> &mut Option>> { + &mut self.higher_order_functions + } + /// Returns the current aggregate_functions value pub fn aggregate_functions(&mut self) -> &mut Option>> { &mut self.aggregate_functions @@ -1611,6 +1849,11 @@ impl SessionStateBuilder { &mut self.function_factory } + /// Returns the cache factory + pub fn cache_factory(&mut self) -> &mut Option> { + &mut self.cache_factory + } + /// Returns the current analyzer_rules value pub fn analyzer_rules( &mut self, @@ -1649,6 +1892,7 @@ impl Debug for SessionStateBuilder { .field("table_options", &self.table_options) .field("table_factories", &self.table_factories) .field("function_factory", &self.function_factory) + .field("cache_factory", &self.cache_factory) .field("expr_planners", &self.expr_planners); #[cfg(feature = "sql")] let ret = ret.field("type_planner", &self.type_planner); @@ -1661,6 +1905,7 @@ impl Debug for SessionStateBuilder { .field("physical_optimizers", &self.physical_optimizers) .field("table_functions", &self.table_functions) .field("scalar_functions", &self.scalar_functions) + .field("higher_order_functions", &self.higher_order_functions) .field("aggregate_functions", &self.aggregate_functions) .field("window_functions", &self.window_functions) .finish() @@ -1695,6 +1940,10 @@ impl ContextProvider for SessionContextProvider<'_> { self.state.expr_planners() } + fn get_relation_planners(&self) -> &[Arc] { + self.state.relation_planners() + } + fn get_type_planner(&self) -> Option> { if let Some(type_planner) = &self.state.type_planner { Some(Arc::clone(type_planner)) @@ -1719,20 +1968,32 @@ impl ContextProvider for SessionContextProvider<'_> { name: &str, args: Vec, ) -> datafusion_common::Result> { + use datafusion_catalog::TableFunctionArgs; + let tbl_func = self .state .table_functions .get(name) .cloned() .ok_or_else(|| plan_datafusion_err!("table function '{name}' not found"))?; - let dummy_schema = DFSchema::empty(); - let simplifier = - ExprSimplifier::new(SessionSimplifyProvider::new(self.state, &dummy_schema)); + let simplify_context = SimplifyContext::builder() + .with_config_options(Arc::clone(self.state.config_options())) + .with_query_execution_start_time( + self.state.execution_props().query_execution_start_time, + ) + .build(); + let simplifier = ExprSimplifier::new(simplify_context); + let schema = DFSchema::empty(); let args = args .into_iter() - .map(|arg| simplifier.simplify(arg)) + .map(|arg| { + simplifier + .coerce(arg, &schema) + .and_then(|e| simplifier.simplify(e)) + }) .collect::>>()?; - let provider = tbl_func.create_table_provider(&args)?; + let provider = tbl_func + .create_table_provider_with_args(TableFunctionArgs::new(&args, self.state))?; Ok(provider_as_source(provider)) } @@ -1755,6 +2016,10 @@ impl ContextProvider for SessionContextProvider<'_> { self.state.scalar_functions().get(name).cloned() } + fn get_higher_order_meta(&self, name: &str) -> Option> { + self.state.higher_order_functions().get(name).cloned() + } + fn get_aggregate_meta(&self, name: &str) -> Option> { self.state.aggregate_functions().get(name).cloned() } @@ -1764,7 +2029,7 @@ impl ContextProvider for SessionContextProvider<'_> { } fn 
get_variable_type(&self, variable_names: &[String]) -> Option { - use datafusion_expr::var_provider::{is_system_variables, VarType}; + use datafusion_expr::var_provider::{VarType, is_system_variables}; if variable_names.is_empty() { return None; @@ -1791,6 +2056,14 @@ impl ContextProvider for SessionContextProvider<'_> { self.state.scalar_functions().keys().cloned().collect() } + fn higher_order_function_names(&self) -> Vec { + self.state + .higher_order_functions() + .keys() + .cloned() + .collect() + } + fn udaf_names(&self) -> Vec { self.state.aggregate_functions().keys().cloned().collect() } @@ -1830,6 +2103,16 @@ impl FunctionRegistry for SessionState { }) } + fn higher_order_function( + &self, + name: &str, + ) -> datafusion_common::Result> { + self.higher_order_functions + .get(name) + .cloned() + .ok_or_else(|| plan_datafusion_err!("Higher Order Function {name} not found")) + } + fn udaf(&self, name: &str) -> datafusion_common::Result> { let result = self.aggregate_functions.get(name); @@ -1857,6 +2140,19 @@ impl FunctionRegistry for SessionState { Ok(self.scalar_functions.insert(udf.name().into(), udf)) } + fn register_higher_order_function( + &mut self, + function: Arc, + ) -> datafusion_common::Result>> { + function.aliases().iter().for_each(|alias| { + self.higher_order_functions + .insert(alias.clone(), Arc::clone(&function)); + }); + Ok(self + .higher_order_functions + .insert(function.name().into(), function)) + } + fn register_udaf( &mut self, udaf: Arc, @@ -1892,6 +2188,19 @@ impl FunctionRegistry for SessionState { Ok(udf) } + fn deregister_higher_order_function( + &mut self, + name: &str, + ) -> datafusion_common::Result>> { + let function = self.higher_order_functions.remove(name); + if let Some(function) = &function { + for alias in function.aliases() { + self.higher_order_functions.remove(alias); + } + } + Ok(function) + } + fn deregister_udaf( &mut self, name: &str, @@ -1938,6 +2247,10 @@ impl FunctionRegistry for SessionState { Ok(()) } + fn higher_order_function_names(&self) -> HashSet { + self.higher_order_functions.keys().cloned().collect() + } + fn udafs(&self) -> HashSet { self.aggregate_functions.keys().cloned().collect() } @@ -1947,8 +2260,14 @@ impl FunctionRegistry for SessionState { } } +impl datafusion_execution::TaskContextProvider for SessionState { + fn task_ctx(&self) -> Arc { + SessionState::task_ctx(self) + } +} + impl OptimizerConfig for SessionState { - fn query_execution_start_time(&self) -> DateTime { + fn query_execution_start_time(&self) -> Option> { self.execution_props.query_execution_start_time } @@ -1974,6 +2293,7 @@ impl From<&SessionState> for TaskContext { state.session_id.clone(), state.config.clone(), state.scalar_functions.clone(), + state.higher_order_functions.clone(), state.aggregate_functions.clone(), state.window_functions.clone(), Arc::clone(&state.runtime_env), @@ -2000,35 +2320,6 @@ impl QueryPlanner for DefaultQueryPlanner { } } -struct SessionSimplifyProvider<'a> { - state: &'a SessionState, - df_schema: &'a DFSchema, -} - -impl<'a> SessionSimplifyProvider<'a> { - fn new(state: &'a SessionState, df_schema: &'a DFSchema) -> Self { - Self { state, df_schema } - } -} - -impl SimplifyInfo for SessionSimplifyProvider<'_> { - fn is_boolean_type(&self, expr: &Expr) -> datafusion_common::Result { - Ok(expr.get_type(self.df_schema)? 
== DataType::Boolean) - } - - fn nullable(&self, expr: &Expr) -> datafusion_common::Result { - expr.nullable(self.df_schema) - } - - fn execution_props(&self) -> &ExecutionProps { - self.state.execution_props() - } - - fn get_data_type(&self, expr: &Expr) -> datafusion_common::Result { - expr.get_type(self.df_schema) - } -} - #[derive(Debug)] pub(crate) struct PreparedPlan { /// Data types of the parameters @@ -2037,14 +2328,27 @@ pub(crate) struct PreparedPlan { pub(crate) plan: Arc, } +/// A [`CacheFactory`] can be registered via [`SessionState`] +/// to create a custom logical plan for [`crate::dataframe::DataFrame::cache`]. +/// Additionally, a custom [`crate::physical_planner::ExtensionPlanner`]/[`QueryPlanner`] +/// may need to be implemented to handle such plans. +pub trait CacheFactory: Debug + Send + Sync { + /// Create a logical plan for caching + fn create( + &self, + plan: LogicalPlan, + session_state: &SessionState, + ) -> datafusion_common::Result; +} + #[cfg(test)] mod tests { use super::{SessionContextProvider, SessionStateBuilder}; use crate::common::assert_contains; use crate::config::ConfigOptions; + use crate::datasource::MemTable; use crate::datasource::empty::EmptyTable; use crate::datasource::provider_as_source; - use crate::datasource::MemTable; use crate::execution::context::SessionState; use crate::logical_expr::planner::ExprPlanner; use crate::logical_expr::{AggregateUDF, ScalarUDF, TableSource, WindowUDF}; @@ -2054,13 +2358,14 @@ mod tests { use arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray}; use arrow::datatypes::{DataType, Field, Schema}; use datafusion_catalog::MemoryCatalogProviderList; - use datafusion_common::config::Dialect; use datafusion_common::DFSchema; use datafusion_common::Result; + use datafusion_common::config::Dialect; use datafusion_execution::config::SessionConfig; use datafusion_expr::Expr; - use datafusion_optimizer::optimizer::OptimizerRule; + use datafusion_expr::HigherOrderUDF; use datafusion_optimizer::Optimizer; + use datafusion_optimizer::optimizer::OptimizerRule; use datafusion_physical_plan::display::DisplayableExecutionPlan; use datafusion_sql::planner::{PlannerContext, SqlToRel}; use std::collections::HashMap; @@ -2097,6 +2402,36 @@ mod tests { assert!(sql_to_expr(&state).is_err()) } + #[test] + #[cfg(feature = "sql")] + fn test_create_logical_expr_from_sql_expr() { + let state = SessionStateBuilder::new().with_default_features().build(); + + let provider = SessionContextProvider { + state: &state, + tables: HashMap::new(), + }; + + let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); + let df_schema = DFSchema::try_from(schema).unwrap(); + let dialect = state.config.options().sql_parser.dialect; + let query = SqlToRel::new_with_options(&provider, state.get_parser_options()); + + for sql in ["[1,2,3]", "a > 10", "SUM(a)"] { + let sql_expr = state.sql_to_expr(sql, &dialect).unwrap(); + let from_str = query + .sql_to_expr(sql_expr, &df_schema, &mut PlannerContext::new()) + .unwrap(); + + let sql_expr_with_alias = + state.sql_to_expr_with_alias(sql, &dialect).unwrap(); + let from_expr = state + .create_logical_expr_from_sql_expr(sql_expr_with_alias, &df_schema) + .unwrap(); + assert_eq!(from_str, from_expr); + } + } + #[test] fn test_from_existing() -> Result<()> { fn employee_batch() -> RecordBatch { @@ -2137,13 +2472,15 @@ mod tests { .table_exist("employee"); assert!(is_exist); let new_state = SessionStateBuilder::new_from_existing(session_state).build(); - assert!(new_state - .catalog_list() - 
.catalog(default_catalog.as_str()) - .unwrap() - .schema(default_schema.as_str()) - .unwrap() - .table_exist("employee")); + assert!( + new_state + .catalog_list() + .catalog(default_catalog.as_str()) + .unwrap() + .schema(default_schema.as_str()) + .unwrap() + .table_exist("employee") + ); // if `with_create_default_catalog_and_schema` is disabled, the new one shouldn't create default catalog and schema let disable_create_default = @@ -2151,10 +2488,12 @@ mod tests { let without_default_state = SessionStateBuilder::new() .with_config(disable_create_default) .build(); - assert!(without_default_state - .catalog_list() - .catalog(&default_catalog) - .is_none()); + assert!( + without_default_state + .catalog_list() + .catalog(&default_catalog) + .is_none() + ); let new_state = SessionStateBuilder::new_from_existing(without_default_state).build(); assert!(new_state.catalog_list().catalog(&default_catalog).is_none()); @@ -2338,6 +2677,10 @@ mod tests { self.state.scalar_functions().get(name).cloned() } + fn get_higher_order_meta(&self, name: &str) -> Option> { + self.state.higher_order_functions().get(name).cloned() + } + fn get_aggregate_meta(&self, name: &str) -> Option> { self.state.aggregate_functions().get(name).cloned() } @@ -2358,6 +2701,14 @@ mod tests { self.state.scalar_functions().keys().cloned().collect() } + fn higher_order_function_names(&self) -> Vec { + self.state + .higher_order_functions() + .keys() + .cloned() + .collect() + } + fn udaf_names(&self) -> Vec { self.state.aggregate_functions().keys().cloned().collect() } diff --git a/datafusion/core/src/execution/session_state_defaults.rs b/datafusion/core/src/execution/session_state_defaults.rs index 62a575541a5d8..5e85c1bbc5e9e 100644 --- a/datafusion/core/src/execution/session_state_defaults.rs +++ b/datafusion/core/src/execution/session_state_defaults.rs @@ -17,6 +17,7 @@ use crate::catalog::listing_schema::ListingSchemaProvider; use crate::catalog::{CatalogProvider, TableProviderFactory}; +use crate::datasource::file_format::FileFormatFactory; use crate::datasource::file_format::arrow::ArrowFormatFactory; #[cfg(feature = "avro")] use crate::datasource::file_format::avro::AvroFormatFactory; @@ -24,7 +25,6 @@ use crate::datasource::file_format::csv::CsvFormatFactory; use crate::datasource::file_format::json::JsonFormatFactory; #[cfg(feature = "parquet")] use crate::datasource::file_format::parquet::ParquetFormatFactory; -use crate::datasource::file_format::FileFormatFactory; use crate::datasource::provider::DefaultTableFactory; use crate::execution::context::SessionState; #[cfg(feature = "nested_expressions")] @@ -36,7 +36,8 @@ use datafusion_execution::config::SessionConfig; use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_execution::runtime_env::RuntimeEnv; use datafusion_expr::planner::ExprPlanner; -use datafusion_expr::{AggregateUDF, ScalarUDF, WindowUDF}; +use datafusion_expr::registry::ExtensionTypeRegistrationRef; +use datafusion_expr::{AggregateUDF, HigherOrderUDF, ScalarUDF, WindowUDF}; use std::collections::HashMap; use std::sync::Arc; use url::Url; @@ -103,7 +104,7 @@ impl SessionStateDefaults { /// returns the list of default [`ScalarUDF`]s pub fn default_scalar_functions() -> Vec> { - #[cfg_attr(not(feature = "nested_expressions"), allow(unused_mut))] + #[cfg_attr(not(feature = "nested_expressions"), expect(unused_mut))] let mut functions: Vec> = functions::all_default_functions(); #[cfg(feature = "nested_expressions")] @@ -112,6 +113,15 @@ impl SessionStateDefaults { functions } + /// 
returns the list of default [`HigherOrderUDF`]s + pub fn default_higher_order_functions() -> Vec> { + #[cfg(feature = "nested_expressions")] + return functions_nested::all_default_higher_order_functions(); + + #[cfg(not(feature = "nested_expressions"))] + return Vec::new(); + } + /// returns the list of default [`AggregateUDF`]s pub fn default_aggregate_functions() -> Vec> { functions_aggregate::all_default_aggregate_functions() @@ -122,6 +132,13 @@ impl SessionStateDefaults { functions_window::all_default_window_functions() } + /// Returns the list of default extension types. + /// + /// For now, we do not register any extension types by default. + pub fn default_extension_types() -> Vec { + vec![] + } + /// returns the list of default [`TableFunction`]s pub fn default_table_functions() -> Vec> { functions_table::all_default_table_functions() @@ -155,7 +172,7 @@ impl SessionStateDefaults { } /// registers all the builtin array functions - #[cfg_attr(not(feature = "nested_expressions"), allow(unused_variables))] + #[cfg_attr(not(feature = "nested_expressions"), expect(unused_variables))] pub fn register_array_functions(state: &mut SessionState) { // register crate of array expressions (if enabled) #[cfg(feature = "nested_expressions")] diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index 381dd5e9e8482..3170f4be7f683 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -35,6 +35,9 @@ ) )] #![warn(missing_docs, clippy::needless_borrow)] +// Use `allow` instead of `expect` for test configuration to explicitly +// disable the lint for all test code rather than expecting violations +#![cfg_attr(test, allow(clippy::needless_pass_by_value))] //! [DataFusion] is an extensible query engine written in Rust that //! uses [Apache Arrow] as its in-memory format. DataFusion's target users are @@ -358,7 +361,7 @@ //! [`TreeNode`]: datafusion_common::tree_node::TreeNode //! [`tree_node module`]: datafusion_expr::logical_plan::tree_node //! [`ExprSimplifier`]: crate::optimizer::simplify_expressions::ExprSimplifier -//! [`expr_api`.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/expr_api.rs +//! [`expr_api`.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/query_planning/expr_api.rs //! //! ### Physical Plans //! @@ -647,7 +650,7 @@ //! //! [Tokio]: https://tokio.rs //! [`Runtime`]: tokio::runtime::Runtime -//! [thread_pools example]: https://github.com/apache/datafusion/tree/main/datafusion-examples/examples/thread_pools.rs +//! [thread_pools example]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/query_planning/thread_pools.rs //! [`task`]: tokio::task //! [Using Rustlang’s Async Tokio Runtime for CPU-Bound Tasks]: https://thenewstack.io/using-rustlangs-async-tokio-runtime-for-cpu-bound-tasks/ //! [`RepartitionExec`]: physical_plan::repartition::RepartitionExec @@ -758,14 +761,13 @@ //! [`RecordBatch`]: arrow::array::RecordBatch //! [`RecordBatchReader`]: arrow::record_batch::RecordBatchReader //! 
[`Array`]: arrow::array::Array - -/// DataFusion crate version -pub const DATAFUSION_VERSION: &str = env!("CARGO_PKG_VERSION"); +#![doc = include_str!("optimizer_rule_reference.md")] extern crate core; - #[cfg(feature = "sql")] extern crate sqlparser; +/// DataFusion crate version +pub const DATAFUSION_VERSION: &str = env!("CARGO_PKG_VERSION"); pub mod dataframe; pub mod datasource; @@ -783,7 +785,10 @@ pub use object_store; pub use parquet; #[cfg(feature = "avro")] -pub use datafusion_datasource_avro::apache_avro; +pub use datafusion_datasource_avro::arrow_avro; + +#[cfg(test)] +mod optimizer_rule_reference; // re-export DataFusion sub-crates at the top level. Use `pub use *` // so that the contents of the subcrates appears in rustdocs @@ -1177,8 +1182,56 @@ doc_comment::doctest!( #[cfg(doctest)] doc_comment::doctest!( - "../../../docs/source/library-user-guide/upgrading.md", - library_user_guide_upgrading + "../../../docs/source/library-user-guide/upgrading/46.0.0.md", + library_user_guide_upgrading_46_0_0 +); + +#[cfg(doctest)] +doc_comment::doctest!( + "../../../docs/source/library-user-guide/upgrading/47.0.0.md", + library_user_guide_upgrading_47_0_0 +); + +#[cfg(doctest)] +doc_comment::doctest!( + "../../../docs/source/library-user-guide/upgrading/48.0.0.md", + library_user_guide_upgrading_48_0_0 +); + +#[cfg(doctest)] +doc_comment::doctest!( + "../../../docs/source/library-user-guide/upgrading/48.0.1.md", + library_user_guide_upgrading_48_0_1 +); + +#[cfg(doctest)] +doc_comment::doctest!( + "../../../docs/source/library-user-guide/upgrading/49.0.0.md", + library_user_guide_upgrading_49_0_0 +); + +#[cfg(doctest)] +doc_comment::doctest!( + "../../../docs/source/library-user-guide/upgrading/50.0.0.md", + library_user_guide_upgrading_50_0_0 +); + +#[cfg(doctest)] +doc_comment::doctest!( + "../../../docs/source/library-user-guide/upgrading/51.0.0.md", + library_user_guide_upgrading_51_0_0 +); + +#[cfg(doctest)] +doc_comment::doctest!( + "../../../docs/source/library-user-guide/upgrading/52.0.0.md", + library_user_guide_upgrading_52_0_0 +); + +#[cfg(doctest)] +doc_comment::doctest!( + "../../../docs/source/library-user-guide/upgrading/53.0.0.md", + library_user_guide_upgrading_53_0_0 ); #[cfg(doctest)] diff --git a/datafusion/core/src/optimizer_rule_reference.md b/datafusion/core/src/optimizer_rule_reference.md new file mode 100644 index 0000000000000..fcbb200c71624 --- /dev/null +++ b/datafusion/core/src/optimizer_rule_reference.md @@ -0,0 +1,93 @@ + + +## Built-in Optimizer Rules + +DataFusion applies a default analyzer, logical optimizer, and physical +optimizer pipeline. + +The rule names listed here match the names shown by `EXPLAIN VERBOSE`. + +Rule order matters. The default pipeline may change between releases. + +### Analyzer Rules + +| order | rule | summary | +| ----- | --------------------------- | --------------------------------------------------------------------------------------- | +| 1 | `resolve_grouping_function` | Rewrites `GROUPING(...)` calls into expressions over DataFusion's internal grouping id. | +| 2 | `type_coercion` | Adds implicit casts so operators and functions receive valid input types. 
| + +### Logical Optimizer Rules + +| order | rule | summary | +| ----- | ----------------------------------------- | --------------------------------------------------------------------------------------------------------------------------- | +| 1 | `rewrite_set_comparison` | Rewrites `ANY` and `ALL` set-comparison subqueries into `EXISTS`-based boolean expressions with correct SQL NULL semantics. | +| 2 | `optimize_unions` | Flattens nested unions and removes unions with a single input. | +| 3 | `simplify_expressions` | Constant-folds and simplifies expressions while preserving output names. | +| 4 | `replace_distinct_aggregate` | Rewrites `DISTINCT` and `DISTINCT ON` operators into aggregate-based plans that later rules can optimize further. | +| 5 | `eliminate_join` | Replaces keyless inner joins with a literal `false` filter by an empty relation. | +| 6 | `decorrelate_predicate_subquery` | Converts eligible `IN` and `EXISTS` predicate subqueries into semi or anti joins. | +| 7 | `scalar_subquery_to_join` | Rewrites eligible scalar subqueries into joins and adds schema-preserving projections. | +| 8 | `decorrelate_lateral_join` | Rewrites eligible lateral joins into regular joins. | +| 9 | `extract_equijoin_predicate` | Splits join filters into equijoin keys and residual predicates. | +| 10 | `eliminate_duplicated_expr` | Removes duplicate expressions from projections, aggregates, and similar operators. | +| 11 | `eliminate_filter` | Drops always-true filters and replaces always-false or NULL filters with empty relations. | +| 12 | `eliminate_cross_join` | Uses filter predicates to replace cross joins with inner joins when join keys can be found. | +| 13 | `eliminate_limit` | Removes no-op limits and simplifies trivial limit shapes. | +| 14 | `propagate_empty_relation` | Pushes empty-relation knowledge upward so operators fed by no rows collapse early. | +| 15 | `filter_null_join_keys` | Adds `IS NOT NULL` filters to nullable equijoin keys that can never match. | +| 16 | `eliminate_outer_join` | Rewrites outer joins to inner joins when later filters reject the NULL-extended rows. | +| 17 | `push_down_limit` | Moves literal limits closer to scans and unions and merges adjacent limits. | +| 18 | `push_down_filter` | Moves filters as early as possible through filter-commutative operators. | +| 19 | `single_distinct_aggregation_to_group_by` | Rewrites single-column `DISTINCT` aggregations into two-stage `GROUP BY` plans. | +| 20 | `eliminate_group_by_constant` | Removes constant or functionally redundant expressions from `GROUP BY`. | +| 21 | `common_sub_expression_eliminate` | Computes repeated subexpressions once and reuses the result. | +| 22 | `extract_leaf_expressions` | Pulls cheap leaf expressions closer to data sources so later pruning and filter rules can act earlier. | +| 23 | `push_down_leaf_projections` | Pushes the helper projections created by leaf extraction toward leaf inputs. | +| 24 | `optimize_projections` | Prunes unused columns and removes unnecessary logical projections. | + +### Physical Optimizer Rules + +The same rule name may appear more than once when the default pipeline runs it +in multiple phases. + +| order | rule | phase | summary | +| ----- | ------------------------------ | ----------------------- | ------------------------------------------------------------------------------------------------------------ | +| 1 | `OutputRequirements` | add phase | Adds helper nodes so output requirements survive later physical rewrites. 
| +| 2 | `aggregate_statistics` | - | Uses exact source statistics to answer some aggregates without scanning data. | +| 3 | `join_selection` | - | Chooses join implementation, build side, and partition mode from statistics and stream properties. | +| 4 | `LimitedDistinctAggregation` | - | Pushes limit hints into grouped distinct-style aggregations when only a small result is needed. | +| 5 | `FilterPushdown` | pre-optimization phase | Pushes supported physical filters down toward data sources before distribution and sorting are enforced. | +| 6 | `EnforceDistribution` | - | Adds repartitioning only where needed to satisfy physical distribution requirements. | +| 7 | `CombinePartialFinalAggregate` | - | Collapses adjacent partial and final aggregates when the distributed shape makes them redundant. | +| 8 | `EnforceSorting` | - | Adds or removes local sorts to satisfy required input orderings. | +| 9 | `OptimizeAggregateOrder` | - | Updates aggregate expressions to use the best ordering once sort requirements are known. | +| 10 | `WindowTopN` | - | Replaces eligible row-number window and filter patterns with per-partition TopK execution. | +| 11 | `ProjectionPushdown` | early pass | Pushes projections toward inputs before later physical rewrites add more limit and TopK structure. | +| 12 | `OutputRequirements` | remove phase | Removes the temporary output-requirement helper nodes after requirement-sensitive planning is done. | +| 13 | `LimitAggregation` | - | Passes a limit hint into eligible aggregations so they can keep fewer accumulator buckets. | +| 14 | `LimitPushPastWindows` | - | Pushes fetch limits through bounded window operators when doing so keeps the result correct. | +| 15 | `HashJoinBuffering` | - | Adds buffering on the probe side of hash joins so probing can start before build completion. | +| 16 | `LimitPushdown` | - | Moves physical limits into child operators or fetch-enabled variants to cut data early. | +| 17 | `TopKRepartition` | - | Pushes TopK below hash repartition when the partition key is a prefix of the sort key. | +| 18 | `ProjectionPushdown` | late pass | Runs projection pushdown again after limit and TopK rewrites expose new pruning opportunities. | +| 19 | `PushdownSort` | - | Pushes sort requirements into data sources that can already return sorted output. | +| 20 | `EnsureCooperative` | - | Wraps non-cooperative plan parts so long-running tasks yield fairly. | +| 21 | `FilterPushdown(Post)` | post-optimization phase | Pushes dynamic filters at the end of optimization, after plan references stop moving. | +| 22 | `SanityCheckPlan` | - | Validates that the final physical plan meets ordering, distribution, and infinite-input safety requirements. | diff --git a/datafusion/core/src/optimizer_rule_reference.rs b/datafusion/core/src/optimizer_rule_reference.rs new file mode 100644 index 0000000000000..64db51b290fdc --- /dev/null +++ b/datafusion/core/src/optimizer_rule_reference.rs @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion_optimizer::analyzer::Analyzer; +use datafusion_optimizer::optimizer::Optimizer; +use datafusion_physical_optimizer::optimizer::PhysicalOptimizer; + +const OPTIMIZER_RULE_REFERENCE: &str = include_str!("optimizer_rule_reference.md"); + +fn documented_rules(section_heading: &str) -> Vec { + let mut in_section = false; + let mut names = vec![]; + + for line in OPTIMIZER_RULE_REFERENCE.lines() { + if line == section_heading { + in_section = true; + continue; + } + + if in_section && line.starts_with("### ") { + break; + } + + if !in_section || !line.starts_with('|') || line.contains("---") { + continue; + } + + let columns: Vec<_> = line.split('|').map(str::trim).collect(); + + if columns.len() < 4 || columns[1] == "order" { + continue; + } + + names.push(columns[2].trim_matches('`').to_string()); + } + + names +} + +#[test] +fn analyzer_rules_match_documented_order() { + let rules: Vec<_> = Analyzer::new() + .rules + .iter() + .map(|rule| rule.name().to_string()) + .collect(); + + assert_eq!(documented_rules("### Analyzer Rules"), rules); +} + +#[test] +fn logical_rules_match_documented_order() { + let rules: Vec<_> = Optimizer::new() + .rules + .iter() + .map(|rule| rule.name().to_string()) + .collect(); + + assert_eq!(documented_rules("### Logical Optimizer Rules"), rules); +} + +#[test] +fn physical_rules_match_documented_order() { + let rules: Vec<_> = PhysicalOptimizer::new() + .rules + .iter() + .map(|rule| rule.name().to_string()) + .collect(); + + assert_eq!(documented_rules("### Physical Optimizer Rules"), rules); +} diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index c280b50a9f07a..3b2c7a78e898e 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -18,13 +18,13 @@ //! 
Planner for [`LogicalPlan`] to [`ExecutionPlan`] use std::borrow::Cow; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; use crate::datasource::file_format::file_type_to_format; use crate::datasource::listing::ListingTableUrl; -use crate::datasource::physical_plan::FileSinkConfig; -use crate::datasource::{source_as_provider, DefaultTableSource}; +use crate::datasource::physical_plan::{FileOutputMode, FileSinkConfig}; +use crate::datasource::{DefaultTableSource, source_as_provider}; use crate::error::{DataFusionError, Result}; use crate::execution::context::{ExecutionProps, SessionState}; use crate::logical_expr::utils::generate_sort_key; @@ -39,7 +39,7 @@ use crate::physical_expr::{create_physical_expr, create_physical_exprs}; use crate::physical_plan::aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy}; use crate::physical_plan::analyze::AnalyzeExec; use crate::physical_plan::explain::ExplainExec; -use crate::physical_plan::filter::FilterExec; +use crate::physical_plan::filter::FilterExecBuilder; use crate::physical_plan::joins::utils as join_utils; use crate::physical_plan::joins::{ CrossJoinExec, HashJoinExec, NestedLoopJoinExec, PartitionMode, SortMergeJoinExec, @@ -52,33 +52,42 @@ use crate::physical_plan::union::UnionExec; use crate::physical_plan::unnest::UnnestExec; use crate::physical_plan::windows::{BoundedWindowAggExec, WindowAggExec}; use crate::physical_plan::{ - displayable, windows, ExecutionPlan, ExecutionPlanProperties, InputOrderMode, - Partitioning, PhysicalExpr, WindowExpr, + ExecutionPlan, ExecutionPlanProperties, InputOrderMode, Partitioning, PhysicalExpr, + WindowExpr, displayable, windows, }; use crate::schema_equivalence::schema_satisfied_by; -use arrow::array::{builder::StringBuilder, RecordBatch}; +use arrow::array::{RecordBatch, builder::StringBuilder}; use arrow::compute::SortOptions; use arrow::datatypes::Schema; +use arrow_schema::Field; use datafusion_catalog::ScanArgs; +use datafusion_common::Column; +use datafusion_common::HashMap as DFHashMap; use datafusion_common::display::ToStringifiedPlan; -use datafusion_common::format::ExplainAnalyzeLevel; -use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}; -use datafusion_common::TableReference; +use datafusion_common::format::ExplainAnalyzeCategories; +use datafusion_common::tree_node::{ + Transformed, TreeNode, TreeNodeRecursion, TreeNodeVisitor, +}; +use datafusion_common::{ + DFSchema, DFSchemaRef, ScalarValue, exec_err, internal_datafusion_err, internal_err, + not_impl_err, plan_err, +}; use datafusion_common::{ - exec_err, internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, - ScalarValue, + TableReference, assert_eq_or_internal_err, assert_or_internal_err, }; use datafusion_datasource::file_groups::FileGroup; use datafusion_datasource::memory::MemorySourceConfig; use datafusion_expr::dml::{CopyTo, InsertOp}; +use datafusion_expr::execution_props::{ScalarSubqueryResults, SubqueryIndex}; use datafusion_expr::expr::{ - physical_name, AggregateFunction, AggregateFunctionParams, Alias, GroupingSet, - NullTreatment, WindowFunction, WindowFunctionParams, + AggregateFunction, AggregateFunctionParams, Alias, GroupingSet, NullTreatment, + WindowFunction, WindowFunctionParams, physical_name, }; use datafusion_expr::expr_rewriter::unnormalize_cols; +use datafusion_expr::logical_plan::Subquery; use datafusion_expr::logical_plan::builder::wrap_projection_for_join_if_necessary; -use 
datafusion_expr::utils::split_conjunction; +use datafusion_expr::utils::{expr_to_columns, split_conjunction}; use datafusion_expr::{ Analyze, BinaryExpr, DescribeTable, DmlStatement, Explain, ExplainFormat, Extension, FetchType, Filter, JoinType, Operator, RecursiveQuery, SkipType, StringifiedPlan, @@ -87,21 +96,22 @@ use datafusion_expr::{ use datafusion_physical_expr::aggregate::{AggregateExprBuilder, AggregateFunctionExpr}; use datafusion_physical_expr::expressions::Literal; use datafusion_physical_expr::{ - create_physical_sort_exprs, LexOrdering, PhysicalSortExpr, + LexOrdering, PhysicalSortExpr, create_physical_sort_exprs, }; use datafusion_physical_optimizer::PhysicalOptimizerRule; use datafusion_physical_plan::empty::EmptyExec; use datafusion_physical_plan::execution_plan::InvariantLevel; use datafusion_physical_plan::joins::PiecewiseMergeJoinExec; -use datafusion_physical_plan::metrics::MetricType; use datafusion_physical_plan::placeholder_row::PlaceholderRowExec; use datafusion_physical_plan::recursive_query::RecursiveQueryExec; +use datafusion_physical_plan::scalar_subquery::{ScalarSubqueryExec, ScalarSubqueryLink}; use datafusion_physical_plan::unnest::ListUnnest; use async_trait::async_trait; use datafusion_physical_plan::async_func::{AsyncFuncExec, AsyncMapper}; use futures::{StreamExt, TryStreamExt}; -use itertools::{multiunzip, Itertools}; +use indexmap::IndexSet; +use itertools::{Itertools, multiunzip}; use log::debug; use tokio::sync::Mutex; @@ -151,6 +161,80 @@ pub trait ExtensionPlanner { physical_inputs: &[Arc], session_state: &SessionState, ) -> Result>>; + + /// Create a physical plan for a [`LogicalPlan::TableScan`]. + /// + /// This is useful for planning valid [`TableSource`]s that are not [`TableProvider`]s. + /// + /// Returns: + /// * `Ok(Some(plan))` if the planner knows how to plan the `scan` + /// * `Ok(None)` if the planner does not know how to plan the `scan` and wants to delegate the planning to another [`ExtensionPlanner`] + /// * `Err` if the planner knows how to plan the `scan` but errors while doing so + /// + /// # Example + /// + /// ```rust,ignore + /// use std::sync::Arc; + /// use datafusion::physical_plan::ExecutionPlan; + /// use datafusion::logical_expr::TableScan; + /// use datafusion::execution::context::SessionState; + /// use datafusion::error::Result; + /// use datafusion_physical_planner::{ExtensionPlanner, PhysicalPlanner}; + /// use async_trait::async_trait; + /// + /// // Your custom table source type + /// struct MyCustomTableSource { /* ... */ } + /// + /// // Your custom execution plan + /// struct MyCustomExec { /* ... 
*/ } + /// + /// struct MyExtensionPlanner; + /// + /// #[async_trait] + /// impl ExtensionPlanner for MyExtensionPlanner { + /// async fn plan_extension( + /// &self, + /// _planner: &dyn PhysicalPlanner, + /// _node: &dyn UserDefinedLogicalNode, + /// _logical_inputs: &[&LogicalPlan], + /// _physical_inputs: &[Arc], + /// _session_state: &SessionState, + /// ) -> Result>> { + /// Ok(None) + /// } + /// + /// async fn plan_table_scan( + /// &self, + /// _planner: &dyn PhysicalPlanner, + /// scan: &TableScan, + /// _session_state: &SessionState, + /// ) -> Result>> { + /// // Check if this is your custom table source + /// if scan.source.is::() { + /// // Create a custom execution plan for your table source + /// let exec = MyCustomExec::new( + /// scan.table_name.clone(), + /// Arc::clone(scan.projected_schema.inner()), + /// ); + /// Ok(Some(Arc::new(exec))) + /// } else { + /// // Return None to let other extension planners handle it + /// Ok(None) + /// } + /// } + /// } + /// ``` + /// + /// [`TableSource`]: datafusion_expr::TableSource + /// [`TableProvider`]: datafusion_catalog::TableProvider + async fn plan_table_scan( + &self, + _planner: &dyn PhysicalPlanner, + _scan: &TableScan, + _session_state: &SessionState, + ) -> Result>> { + Ok(None) + } } /// Default single node physical query planner that converts a @@ -272,7 +356,8 @@ struct LogicalNode<'a> { impl DefaultPhysicalPlanner { /// Create a physical planner that uses `extension_planners` to - /// plan user-defined logical nodes [`LogicalPlan::Extension`]. + /// plan user-defined logical nodes [`LogicalPlan::Extension`] + /// or user-defined table sources in [`LogicalPlan::TableScan`]. /// The planner uses the first [`ExtensionPlanner`] to return a non-`None` /// plan. pub fn with_extension_planners( @@ -281,8 +366,111 @@ impl DefaultPhysicalPlanner { Self { extension_planners } } - /// Create a physical plan from a logical plan - async fn create_initial_plan( + fn ensure_schema_matches( + &self, + logical_schema: &DFSchemaRef, + physical_plan: &Arc, + context: &str, + ) -> Result<()> { + if !logical_schema.matches_arrow_schema(&physical_plan.schema()) { + return plan_err!( + "{} created an ExecutionPlan with mismatched schema. \ + LogicalPlan schema: {:?}, ExecutionPlan schema: {:?}", + context, + logical_schema, + physical_plan.schema() + ); + } + Ok(()) + } + + /// Collect uncorrelated scalar subqueries. We don't descend into nested + /// subqueries here: each call to `create_initial_plan` handles subqueries + /// at its level and then recurses in order to handle nested subqueries. + #[allow(clippy::allow_attributes, clippy::mutable_key_type)] // Subquery contains Arc with interior mutability but is intentionally used as hash key + fn collect_scalar_subqueries(plan: &LogicalPlan) -> Vec { + let mut subqueries = IndexSet::new(); + plan.apply(|node| { + for expr in node.expressions() { + expr.apply(|e| { + if let Expr::ScalarSubquery(sq) = e + && sq.outer_ref_columns.is_empty() + { + subqueries.insert(sq.clone()); + } + Ok(TreeNodeRecursion::Continue) + }) + .expect("infallible"); + } + Ok(TreeNodeRecursion::Continue) + }) + .expect("infallible"); + subqueries.into_iter().collect() + } + + /// Create a physical plan from a logical plan. + /// + /// Uncorrelated scalar subqueries in the plan's own expressions are + /// collected, planned as separate physical plans, and each assigned an + /// index in a shared [`ScalarSubqueryResults`] container that will hold its + /// result at execution time. 
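+    /// For example, given `WHERE x > (SELECT min(y) FROM u)`, the single
+    /// uncorrelated subquery is planned once and assigned index 0; the
+    /// rewritten predicate (schematically, `x > ScalarSubqueryExpr(0)`) then
+    /// reads slot 0 of the shared container when it is evaluated.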
The index map and shared results container are + /// registered in [`ExecutionProps`] so that [`create_physical_expr`] can + /// convert `Expr::ScalarSubquery` into [`ScalarSubqueryExpr`] nodes that + /// read from that container. + /// + /// The resulting physical plan is wrapped in a [`ScalarSubqueryExec`] node + /// that executes those subquery plans before any data flows through the + /// main plan. If a subquery itself contains nested uncorrelated subqueries, + /// the recursive call produces its own [`ScalarSubqueryExec`] inside the + /// subquery plan — each level manages only its own subqueries. + /// + /// Returns a [`BoxFuture`] rather than using `async fn` because of + /// this recursion. + /// + /// [`ScalarSubqueryExpr`]: datafusion_physical_expr::scalar_subquery::ScalarSubqueryExpr + /// [`BoxFuture`]: futures::future::BoxFuture + fn create_initial_plan<'a>( + &'a self, + logical_plan: &'a LogicalPlan, + session_state: &'a SessionState, + ) -> futures::future::BoxFuture<'a, Result>> { + Box::pin(async move { + let all_subqueries = Self::collect_scalar_subqueries(logical_plan); + let (links, index_map) = self + .plan_scalar_subqueries(all_subqueries, session_state) + .await?; + + if links.is_empty() { + return self + .create_initial_plan_inner(logical_plan, session_state) + .await; + } + + // Create the shared `ScalarSubqueryResults` container and register + // it in `ExecutionProps` so that `create_physical_expr` can resolve + // `Expr::ScalarSubquery` into `ScalarSubqueryExpr` nodes. We clone + // the `SessionState` so these are available throughout physical + // planning without mutating the caller's state. + // + // Ideally, the subquery state would live in a dedicated planning + // context rather than in `ExecutionProps`. It's here because + // `create_physical_expr` only receives `&ExecutionProps`. + let results = ScalarSubqueryResults::new(links.len()); + let mut owned = session_state.clone(); + owned.execution_props_mut().subquery_indexes = index_map; + owned.execution_props_mut().subquery_results = results.clone(); + let session_state = Cow::Owned(owned); + + let plan = self + .create_initial_plan_inner(logical_plan, &session_state) + .await?; + Ok(Arc::new(ScalarSubqueryExec::new(plan, links, results))) + }) + } + + /// Inner physical planning that converts a logical plan tree into an + /// execution plan tree without collecting scalar subqueries. + async fn create_initial_plan_inner( &self, logical_plan: &LogicalPlan, session_state: &SessionState, @@ -347,11 +535,11 @@ impl DefaultPhysicalPlanner { .flatten() .collect::>(); // Ideally this never happens if we have a valid LogicalPlan tree - if outputs.len() != 1 { - return internal_err!( - "Failed to convert LogicalPlan to ExecutionPlan: More than one root detected" - ); - } + assert_eq_or_internal_err!( + outputs.len(), + 1, + "Failed to convert LogicalPlan to ExecutionPlan: More than one root detected" + ); let plan = outputs.pop().unwrap(); Ok(plan) } @@ -447,27 +635,56 @@ impl DefaultPhysicalPlanner { session_state: &SessionState, children: ChildrenContainer, ) -> Result> { + let execution_props = session_state.execution_props(); let exec_node: Arc = match node { // Leaves (no children) - LogicalPlan::TableScan(TableScan { - source, - projection, - filters, - fetch, - .. 
- }) => { - let source = source_as_provider(source)?; - // Remove all qualifiers from the scan as the provider - // doesn't know (nor should care) how the relation was - // referred to in the query - let filters = unnormalize_cols(filters.iter().cloned()); - let filters_vec = filters.into_iter().collect::>(); - let opts = ScanArgs::default() - .with_projection(projection.as_deref()) - .with_filters(Some(&filters_vec)) - .with_limit(*fetch); - let res = source.scan_with_args(session_state, opts).await?; - Arc::clone(res.plan()) + LogicalPlan::TableScan(scan) => { + let TableScan { + source, + projection, + filters, + fetch, + projected_schema, + .. + } = scan; + + if let Ok(source) = source_as_provider(source) { + // Remove all qualifiers from the scan as the provider + // doesn't know (nor should care) how the relation was + // referred to in the query + let filters = unnormalize_cols(filters.iter().cloned()); + let filters_vec = filters.into_iter().collect::>(); + let opts = ScanArgs::default() + .with_projection(projection.as_deref()) + .with_filters(Some(&filters_vec)) + .with_limit(*fetch); + let res = source.scan_with_args(session_state, opts).await?; + Arc::clone(res.plan()) + } else { + let mut maybe_plan = None; + for planner in &self.extension_planners { + if maybe_plan.is_some() { + break; + } + + maybe_plan = + planner.plan_table_scan(self, scan, session_state).await?; + } + + let plan = match maybe_plan { + Some(plan) => plan, + None => { + return plan_err!( + "No installed planner was able to plan TableScan for custom TableSource: {:?}", + scan.table_name + ); + } + }; + let context = + format!("Extension planner for table scan {}", scan.table_name); + self.ensure_schema_matches(projected_schema, &plan, &context)?; + plan + } } LogicalPlan::Values(Values { values, schema }) => { let exprs = values @@ -475,7 +692,7 @@ impl DefaultPhysicalPlanner { .map(|row| { row.iter() .map(|expr| { - self.create_physical_expr(expr, schema, session_state) + create_physical_expr(expr, schema, execution_props) }) .collect::>>>() }) @@ -496,7 +713,7 @@ impl DefaultPhysicalPlanner { output_schema, }) => { let output_schema = Arc::clone(output_schema.inner()); - self.plan_describe(Arc::clone(schema), output_schema)? + self.plan_describe(&Arc::clone(schema), output_schema)? 
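+                // DESCRIBE is answered without scanning the table:
+                // `plan_describe` below synthesizes an in-memory record batch
+                // directly from the table schema.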
} // 1 Child @@ -525,16 +742,48 @@ impl DefaultPhysicalPlanner { let keep_partition_by_columns = match source_option_tuples .get("execution.keep_partition_by_columns") - .map(|v| v.trim()) { - None => session_state.config().options().execution.keep_partition_by_columns, + .map(|v| v.trim()) + { + None => { + session_state + .config() + .options() + .execution + .keep_partition_by_columns + } Some("true") => true, Some("false") => false, - Some(value) => - return Err(DataFusionError::Configuration(format!("provided value for 'execution.keep_partition_by_columns' was not recognized: \"{value}\""))), + Some(value) => { + return Err(DataFusionError::Configuration(format!( + "provided value for 'execution.keep_partition_by_columns' was not recognized: \"{value}\"" + ))); + } }; + // Parse single_file_output option if explicitly set + let file_output_mode = match source_option_tuples + .get("single_file_output") + .map(|v| v.trim()) + { + None => FileOutputMode::Automatic, + Some("true") => FileOutputMode::SingleFile, + Some("false") => FileOutputMode::Directory, + Some(value) => { + return Err(DataFusionError::Configuration(format!( + "provided value for 'single_file_output' was not recognized: \"{value}\"" + ))); + } + }; + + // Filter out sink-related options that are not format options + let format_options: HashMap = source_option_tuples + .iter() + .filter(|(k, _)| k.as_str() != "single_file_output") + .map(|(k, v)| (k.clone(), v.clone())) + .collect(); + let sink_format = file_type_to_format(file_type)? - .create(session_state, source_option_tuples)?; + .create(session_state, &format_options)?; // Determine extension based on format extension and compression let file_extension = match sink_format.compression_type() { @@ -555,6 +804,7 @@ impl DefaultPhysicalPlanner { insert_op: InsertOp::Append, keep_partition_by_columns, file_extension, + file_output_mode, }; let ordering = input_exec.properties().output_ordering().cloned(); @@ -573,9 +823,7 @@ impl DefaultPhysicalPlanner { op: WriteOp::Insert(insert_op), .. }) => { - if let Some(provider) = - target.as_any().downcast_ref::() - { + if let Some(provider) = target.downcast_ref::() { let input_exec = children.one()?; provider .table_provider @@ -587,18 +835,89 @@ impl DefaultPhysicalPlanner { ); } } - LogicalPlan::Window(Window { window_expr, .. }) => { - if window_expr.is_empty() { - return internal_err!("Impossibly got empty window expression"); + LogicalPlan::Dml(DmlStatement { + table_name, + target, + op: WriteOp::Delete, + input, + .. + }) => { + if let Some(provider) = target.downcast_ref::() { + let filters = extract_dml_filters(input, table_name)?; + provider + .table_provider + .delete_from(session_state, filters) + .await + .map_err(|e| { + e.context(format!("DELETE operation on table '{table_name}'")) + })? + } else { + return exec_err!( + "Table source can't be downcasted to DefaultTableSource" + ); } + } + LogicalPlan::Dml(DmlStatement { + table_name, + target, + op: WriteOp::Update, + input, + .. 
+ }) => { + if let Some(provider) = target.downcast_ref::() { + // For UPDATE, the assignments are encoded in the projection of input + // We pass the filters and let the provider handle the projection + let filters = extract_dml_filters(input, table_name)?; + // Extract assignments from the projection in input plan + let assignments = extract_update_assignments(input)?; + provider + .table_provider + .update(session_state, assignments, filters) + .await + .map_err(|e| { + e.context(format!("UPDATE operation on table '{table_name}'")) + })? + } else { + return exec_err!( + "Table source can't be downcasted to DefaultTableSource" + ); + } + } + LogicalPlan::Dml(DmlStatement { + table_name, + target, + op: WriteOp::Truncate, + .. + }) => { + if let Some(provider) = target.downcast_ref::() { + provider + .table_provider + .truncate(session_state) + .await + .map_err(|e| { + e.context(format!( + "TRUNCATE operation on table '{table_name}'" + )) + })? + } else { + return exec_err!( + "Table source can't be downcasted to DefaultTableSource" + ); + } + } + LogicalPlan::Window(Window { window_expr, .. }) => { + assert_or_internal_err!( + !window_expr.is_empty(), + "Impossibly got empty window expression" + ); let input_exec = children.one()?; let get_sort_keys = |expr: &Expr| match expr { Expr::WindowFunction(window_fun) => { let WindowFunctionParams { - ref partition_by, - ref order_by, + partition_by, + order_by, .. } = &window_fun.as_ref().params; generate_sort_key(partition_by, order_by) @@ -608,8 +927,8 @@ impl DefaultPhysicalPlanner { match &**expr { Expr::WindowFunction(window_fun) => { let WindowFunctionParams { - ref partition_by, - ref order_by, + partition_by, + order_by, .. } = &window_fun.as_ref().params; generate_sort_key(partition_by, order_by) @@ -622,23 +941,17 @@ impl DefaultPhysicalPlanner { let sort_keys = get_sort_keys(&window_expr[0])?; if window_expr.len() > 1 { debug_assert!( - window_expr[1..] - .iter() - .all(|expr| get_sort_keys(expr).unwrap() == sort_keys), - "all window expressions shall have the same sort keys, as guaranteed by logical planning" - ); + window_expr[1..] 
+ .iter() + .all(|expr| get_sort_keys(expr).unwrap() == sort_keys), + "all window expressions shall have the same sort keys, as guaranteed by logical planning" + ); } let logical_schema = node.schema(); let window_expr = window_expr .iter() - .map(|e| { - create_window_expr( - e, - logical_schema, - session_state.execution_props(), - ) - }) + .map(|e| create_window_expr(e, logical_schema, execution_props)) .collect::>>()?; let can_repartition = session_state.config().target_partitions() > 1 @@ -683,6 +996,17 @@ impl DefaultPhysicalPlanner { ) { let mut differences = Vec::new(); + + if physical_input_schema.metadata() + != physical_input_schema_from_logical.metadata() + { + differences.push(format!( + "schema metadata differs: (physical) {:?} vs (logical) {:?}", + physical_input_schema.metadata(), + physical_input_schema_from_logical.metadata() + )); + } + if physical_input_schema.fields().len() != physical_input_schema_from_logical.fields().len() { @@ -712,18 +1036,27 @@ impl DefaultPhysicalPlanner { if physical_field.is_nullable() && !logical_field.is_nullable() { differences.push(format!("field nullability at index {} [{}]: (physical) {} vs (logical) {}", i, physical_field.name(), physical_field.is_nullable(), logical_field.is_nullable())); } + if physical_field.metadata() != logical_field.metadata() { + differences.push(format!( + "field metadata at index {} [{}]: (physical) {:?} vs (logical) {:?}", + i, + physical_field.name(), + physical_field.metadata(), + logical_field.metadata() + )); + } } - return internal_err!("Physical input schema should be the same as the one converted from logical input schema. Differences: {}", differences - .iter() - .map(|s| format!("\n\t- {s}")) - .join("")); + return internal_err!( + "Physical input schema should be the same as the one converted from logical input schema. Differences: {}", + differences.iter().map(|s| format!("\n\t- {s}")).join("") + ); } let groups = self.create_grouping_physical_expr( group_expr, logical_input_schema, &physical_input_schema, - session_state, + execution_props, )?; let agg_filter = aggr_expr @@ -733,7 +1066,7 @@ impl DefaultPhysicalPlanner { e, logical_input_schema, &physical_input_schema, - session_state.execution_props(), + execution_props, ) }) .collect::>>()?; @@ -776,7 +1109,7 @@ impl DefaultPhysicalPlanner { _ => { return internal_err!( "Unexpected result from try_plan_async_exprs" - ) + ); } } } @@ -827,8 +1160,8 @@ impl DefaultPhysicalPlanner { )?) } LogicalPlan::Projection(Projection { input, expr, .. }) => self - .create_project_physical_exec( - session_state, + .create_project_physical_exec_with_props( + execution_props, children.one()?, input, expr, @@ -838,9 +1171,8 @@ impl DefaultPhysicalPlanner { }) => { let physical_input = children.one()?; let input_dfschema = input.schema(); - let runtime_expr = - self.create_physical_expr(predicate, input_dfschema, session_state)?; + create_physical_expr(predicate, input_dfschema, execution_props)?; let input_schema = input.schema(); let filter = match self.try_plan_async_exprs( @@ -849,7 +1181,12 @@ impl DefaultPhysicalPlanner { input_schema.as_arrow(), )? { PlanAsyncExpr::Sync(PlannedExprResult::Expr(runtime_expr)) => { - FilterExec::try_new(Arc::clone(&runtime_expr[0]), physical_input)? + FilterExecBuilder::new( + Arc::clone(&runtime_expr[0]), + physical_input, + ) + .with_batch_size(session_state.config().batch_size()) + .build()? 
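+                        // The async path below also goes through
+                        // FilterExecBuilder, threading the same session batch
+                        // size into the operator.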
} PlanAsyncExpr::Async( async_map, @@ -859,20 +1196,22 @@ impl DefaultPhysicalPlanner { async_map.async_exprs, physical_input, )?; - FilterExec::try_new( + FilterExecBuilder::new( Arc::clone(&runtime_expr[0]), Arc::new(async_exec), - )? + ) // project the output columns excluding the async functions // The async functions are always appended to the end of the schema. - .with_projection(Some( - (0..input.schema().fields().len()).collect(), + .apply_projection(Some( + (0..input.schema().fields().len()).collect::>(), ))? + .with_batch_size(session_state.config().batch_size()) + .build()? } _ => { return internal_err!( "Unexpected result from try_plan_async_exprs" - ) + ); } }; @@ -881,7 +1220,9 @@ impl DefaultPhysicalPlanner { .options() .optimizer .default_filter_selectivity; - Arc::new(filter.with_default_selectivity(selectivity)?) + let filter_exec: Arc = + Arc::new(filter.with_default_selectivity(selectivity)?); + filter_exec } LogicalPlan::Repartition(Repartition { input, @@ -897,11 +1238,7 @@ impl DefaultPhysicalPlanner { let runtime_expr = expr .iter() .map(|e| { - self.create_physical_expr( - e, - input_dfschema, - session_state, - ) + create_physical_expr(e, input_dfschema, execution_props) }) .collect::>>()?; Partitioning::Hash(runtime_expr, *n) @@ -922,11 +1259,8 @@ impl DefaultPhysicalPlanner { }) => { let physical_input = children.one()?; let input_dfschema = input.as_ref().schema(); - let sort_exprs = create_physical_sort_exprs( - expr, - input_dfschema, - session_state.execution_props(), - )?; + let sort_exprs = + create_physical_sort_exprs(expr, input_dfschema, execution_props)?; let Some(ordering) = LexOrdering::new(sort_exprs) else { return internal_err!( "SortExec requires at least one sort expression" @@ -935,7 +1269,14 @@ impl DefaultPhysicalPlanner { let new_sort = SortExec::new(ordering, physical_input).with_fetch(*fetch); Arc::new(new_sort) } - LogicalPlan::Subquery(_) => todo!(), + // The optimizer's decorrelation passes remove Subquery nodes + // for supported patterns. This error is hit for correlated + // patterns that the optimizer cannot (yet) decorrelate. + LogicalPlan::Subquery(_) => { + return not_impl_err!( + "Physical plan does not support undecorrelated Subquery" + ); + } LogicalPlan::SubqueryAlias(_) => children.one()?, LogicalPlan::Limit(limit) => { let input = children.one()?; @@ -1000,6 +1341,7 @@ impl DefaultPhysicalPlanner { filter, join_type, null_equality, + null_aware, schema: join_schema, .. }) => { @@ -1045,8 +1387,8 @@ impl DefaultPhysicalPlanner { ( true, LogicalPlan::Projection(Projection { input, expr, .. }), - ) => self.create_project_physical_exec( - session_state, + ) => self.create_project_physical_exec_with_props( + execution_props, physical_left, input, expr, @@ -1058,8 +1400,8 @@ impl DefaultPhysicalPlanner { ( true, LogicalPlan::Projection(Projection { input, expr, .. 
}), - ) => self.create_project_physical_exec( - session_state, + ) => self.create_project_physical_exec_with_props( + execution_props, physical_right, input, expr, @@ -1124,7 +1466,6 @@ impl DefaultPhysicalPlanner { // All equi-join keys are columns now, create physical join plan let left_df_schema = left.schema(); let right_df_schema = right.schema(); - let execution_props = session_state.execution_props(); let join_on = keys .iter() .map(|(l, r)| { @@ -1207,7 +1548,7 @@ impl DefaultPhysicalPlanner { let filter_df_fields = filter_df_fields .into_iter() .map(|(qualifier, field)| { - (qualifier.cloned(), Arc::new(field.clone())) + (qualifier.cloned(), Arc::clone(field)) }) .collect(); @@ -1230,7 +1571,7 @@ impl DefaultPhysicalPlanner { let filter_expr = create_physical_expr( expr, &filter_df_schema, - session_state.execution_props(), + execution_props, )?; let column_indices = join_utils::JoinFilter::build_column_indices( left_field_indices, @@ -1251,7 +1592,7 @@ impl DefaultPhysicalPlanner { // TODO: Allow PWMJ to deal with residual equijoin conditions let join: Arc = if join_on.is_empty() { - if join_filter.is_none() && matches!(join_type, JoinType::Inner) { + if join_filter.is_none() && *join_type == JoinType::Inner { // cross join if there is no join conditions and no join filter set Arc::new(CrossJoinExec::new(physical_left, physical_right)) } else if num_range_filters == 1 @@ -1326,9 +1667,7 @@ impl DefaultPhysicalPlanner { let left_side = side_of(lhs_logical)?; let right_side = side_of(rhs_logical)?; - if matches!(left_side, Side::Both) - || matches!(right_side, Side::Both) - { + if left_side == Side::Both || right_side == Side::Both { return Ok(Arc::new(NestedLoopJoinExec::try_new( physical_left, physical_right, @@ -1352,12 +1691,12 @@ impl DefaultPhysicalPlanner { let on_left = create_physical_expr( lhs_logical, left_df_schema, - session_state.execution_props(), + execution_props, )?; let on_right = create_physical_expr( rhs_logical, right_df_schema, - session_state.execution_props(), + execution_props, )?; Arc::new(PiecewiseMergeJoinExec::try_new( @@ -1396,6 +1735,8 @@ impl DefaultPhysicalPlanner { } else if session_state.config().target_partitions() > 1 && session_state.config().repartition_joins() && prefer_hash_join + && !*null_aware + // Null-aware joins must use CollectLeft { Arc::new(HashJoinExec::try_new( physical_left, @@ -1406,6 +1747,7 @@ impl DefaultPhysicalPlanner { None, PartitionMode::Auto, *null_equality, + *null_aware, )?) } else { Arc::new(HashJoinExec::try_new( @@ -1417,13 +1759,19 @@ impl DefaultPhysicalPlanner { None, PartitionMode::CollectLeft, *null_equality, + *null_aware, )?) }; // If plan was mutated previously then need to create the ExecutionPlan // for the new Projection that was applied on top. if let Some((input, expr)) = new_project { - self.create_project_physical_exec(session_state, join, input, expr)? + self.create_project_physical_exec_with_props( + execution_props, + join, + input, + expr, + )? } else { join } @@ -1463,22 +1811,16 @@ impl DefaultPhysicalPlanner { } let plan = match maybe_plan { - Some(v) => Ok(v), - _ => plan_err!("No installed planner was able to convert the custom node to an execution plan: {:?}", node) - }?; - - // Ensure the ExecutionPlan's schema matches the - // declared logical schema to catch and warn about - // logic errors when creating user defined plans. - if !node.schema().matches_arrow_schema(&plan.schema()) { - return plan_err!( - "Extension planner for {:?} created an ExecutionPlan with mismatched schema. 
\ - LogicalPlan schema: {:?}, ExecutionPlan schema: {:?}", - node, node.schema(), plan.schema() - ); - } else { - plan - } + Some(v) => Ok(v), + _ => plan_err!( + "No installed planner was able to convert the custom node to an execution plan: {:?}", + node + ), + }?; + + let context = format!("Extension planner for {node:?}"); + self.ensure_schema_matches(node.schema(), &plan, &context)?; + plan } // Other @@ -1502,17 +1844,17 @@ impl DefaultPhysicalPlanner { LogicalPlan::Explain(_) => { return internal_err!( "Unsupported logical plan: Explain must be root of the plan" - ) + ); } LogicalPlan::Distinct(_) => { return internal_err!( "Unsupported logical plan: Distinct should be replaced to Aggregate" - ) + ); } LogicalPlan::Analyze(_) => { return internal_err!( "Unsupported logical plan: Analyze must be root of the plan" - ) + ); } }; Ok(exec_node) @@ -1523,7 +1865,7 @@ impl DefaultPhysicalPlanner { group_expr: &[Expr], input_dfschema: &DFSchema, input_schema: &Schema, - session_state: &SessionState, + execution_props: &ExecutionProps, ) -> Result { if group_expr.len() == 1 { match &group_expr[0] { @@ -1532,38 +1874,39 @@ impl DefaultPhysicalPlanner { grouping_sets, input_dfschema, input_schema, - session_state, + execution_props, ) } Expr::GroupingSet(GroupingSet::Cube(exprs)) => create_cube_physical_expr( exprs, input_dfschema, input_schema, - session_state, + execution_props, ), Expr::GroupingSet(GroupingSet::Rollup(exprs)) => { create_rollup_physical_expr( exprs, input_dfschema, input_schema, - session_state, + execution_props, ) } expr => Ok(PhysicalGroupBy::new_single(vec![tuple_err(( - self.create_physical_expr(expr, input_dfschema, session_state), + create_physical_expr(expr, input_dfschema, execution_props), physical_name(expr), ))?])), } } else if group_expr.is_empty() { // No GROUP BY clause - create empty PhysicalGroupBy - Ok(PhysicalGroupBy::new(vec![], vec![], vec![])) + // no expressions, no null expressions and no grouping expressions + Ok(PhysicalGroupBy::new(vec![], vec![], vec![], false)) } else { Ok(PhysicalGroupBy::new_single( group_expr .iter() .map(|e| { tuple_err(( - self.create_physical_expr(e, input_dfschema, session_state), + create_physical_expr(e, input_dfschema, execution_props), physical_name(e), )) }) @@ -1587,7 +1930,7 @@ fn merge_grouping_set_physical_expr( grouping_sets: &[Vec], input_dfschema: &DFSchema, input_schema: &Schema, - session_state: &SessionState, + execution_props: &ExecutionProps, ) -> Result { let num_groups = grouping_sets.len(); let mut all_exprs: Vec = vec![]; @@ -1601,14 +1944,14 @@ fn merge_grouping_set_physical_expr( grouping_set_expr.push(get_physical_expr_pair( expr, input_dfschema, - session_state, + execution_props, )?); null_exprs.push(get_null_physical_expr_pair( expr, input_dfschema, input_schema, - session_state, + execution_props, )?); } } @@ -1628,6 +1971,7 @@ fn merge_grouping_set_physical_expr( grouping_set_expr, null_exprs, merged_sets, + true, )) } @@ -1637,7 +1981,7 @@ fn create_cube_physical_expr( exprs: &[Expr], input_dfschema: &DFSchema, input_schema: &Schema, - session_state: &SessionState, + execution_props: &ExecutionProps, ) -> Result { let num_of_exprs = exprs.len(); let num_groups = num_of_exprs * num_of_exprs; @@ -1652,10 +1996,14 @@ fn create_cube_physical_expr( expr, input_dfschema, input_schema, - session_state, + execution_props, )?); - all_exprs.push(get_physical_expr_pair(expr, input_dfschema, session_state)?) + all_exprs.push(get_physical_expr_pair( + expr, + input_dfschema, + execution_props, + )?) 
} let mut groups: Vec> = Vec::with_capacity(num_groups); @@ -1670,7 +2018,7 @@ fn create_cube_physical_expr( } } - Ok(PhysicalGroupBy::new(all_exprs, null_exprs, groups)) + Ok(PhysicalGroupBy::new(all_exprs, null_exprs, groups, true)) } /// Expand and align a ROLLUP expression. This is a special case of GROUPING SETS @@ -1679,7 +2027,7 @@ fn create_rollup_physical_expr( exprs: &[Expr], input_dfschema: &DFSchema, input_schema: &Schema, - session_state: &SessionState, + execution_props: &ExecutionProps, ) -> Result { let num_of_exprs = exprs.len(); @@ -1695,10 +2043,14 @@ fn create_rollup_physical_expr( expr, input_dfschema, input_schema, - session_state, + execution_props, )?); - all_exprs.push(get_physical_expr_pair(expr, input_dfschema, session_state)?) + all_exprs.push(get_physical_expr_pair( + expr, + input_dfschema, + execution_props, + )?) } for total in 0..=num_of_exprs { @@ -1715,7 +2067,7 @@ fn create_rollup_physical_expr( groups.push(group) } - Ok(PhysicalGroupBy::new(all_exprs, null_exprs, groups)) + Ok(PhysicalGroupBy::new(all_exprs, null_exprs, groups, true)) } /// For a given logical expr, get a properly typed NULL ScalarValue physical expression @@ -1723,10 +2075,9 @@ fn get_null_physical_expr_pair( expr: &Expr, input_dfschema: &DFSchema, input_schema: &Schema, - session_state: &SessionState, + execution_props: &ExecutionProps, ) -> Result<(Arc, String)> { - let physical_expr = - create_physical_expr(expr, input_dfschema, session_state.execution_props())?; + let physical_expr = create_physical_expr(expr, input_dfschema, execution_props)?; let physical_name = physical_name(&expr.clone())?; let data_type = physical_expr.data_type(input_schema)?; @@ -1752,11 +2103,11 @@ fn qualify_join_schema_sides( let join_fields = join_schema.fields(); // Validate lengths - if join_fields.len() != left_fields.len() + right_fields.len() { - return internal_err!( - "Join schema field count must match left and right field count." - ); - } + assert_eq_or_internal_err!( + join_fields.len(), + left_fields.len() + right_fields.len(), + "Join schema field count must match left and right field count." + ); // Validate field names match for (i, (field, expected)) in join_fields @@ -1764,14 +2115,12 @@ fn qualify_join_schema_sides( .zip(left_fields.iter().chain(right_fields.iter())) .enumerate() { - if field.name() != expected.name() { - return internal_err!( - "Field name mismatch at index {}: expected '{}', found '{}'", - i, - expected.name(), - field.name() - ); - } + assert_eq_or_internal_err!( + field.name(), + expected.name(), + "Field name mismatch at index {}", + i + ); } // qualify sides @@ -1797,14 +2146,240 @@ fn qualify_join_schema_sides( fn get_physical_expr_pair( expr: &Expr, input_dfschema: &DFSchema, - session_state: &SessionState, + execution_props: &ExecutionProps, ) -> Result<(Arc, String)> { - let physical_expr = - create_physical_expr(expr, input_dfschema, session_state.execution_props())?; + let physical_expr = create_physical_expr(expr, input_dfschema, execution_props)?; let physical_name = physical_name(expr)?; Ok((physical_expr, physical_name)) } +/// Extract filter predicates from a DML input plan (DELETE/UPDATE). +/// +/// Walks the logical plan tree and collects Filter predicates and any filters +/// pushed down into TableScan nodes, splitting AND conjunctions into individual expressions. +/// +/// For UPDATE...FROM queries involving multiple tables, this function only extracts predicates +/// that reference the target table. 
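+/// For example, for `DELETE FROM t WHERE t.a > 5 AND b = 'x'` the
+/// conjunction is split and qualifiers are stripped, yielding the two
+/// predicates `a > 5` and `b = 'x'`.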
Filters from source table scans are excluded to prevent +/// incorrect filter semantics. +/// +/// Column qualifiers are stripped so expressions can be evaluated against the TableProvider's +/// schema. Deduplication is performed because filters may appear in both Filter nodes and +/// TableScan.filters when the optimizer performs partial (Inexact) filter pushdown. +/// +/// # Parameters +/// - `input`: The logical plan tree to extract filters from (typically a DELETE or UPDATE plan) +/// - `target`: The target table reference to scope filter extraction (prevents multi-table filter leakage) +/// +/// # Returns +/// A vector of unqualified filter expressions that can be passed to the TableProvider for execution. +/// Returns an empty vector if no applicable filters are found. +/// +#[allow(clippy::allow_attributes, clippy::mutable_key_type)] // Expr contains Arc with interior mutability but is intentionally used as hash key +fn extract_dml_filters( + input: &Arc, + target: &TableReference, +) -> Result> { + let mut filters = Vec::new(); + let mut allowed_refs = vec![target.clone()]; + + // First pass: collect any alias references to the target table + input.apply(|node| { + if let LogicalPlan::SubqueryAlias(alias) = node + // Check if this alias points to the target table + && let LogicalPlan::TableScan(scan) = alias.input.as_ref() + && scan.table_name.resolved_eq(target) + { + allowed_refs.push(TableReference::bare(alias.alias.to_string())); + } + Ok(TreeNodeRecursion::Continue) + })?; + + input.apply(|node| { + match node { + LogicalPlan::Filter(filter) => { + // Split AND predicates into individual expressions + for predicate in split_conjunction(&filter.predicate) { + if predicate_is_on_target_multi(predicate, &allowed_refs)? { + filters.push(predicate.clone()); + } + } + } + LogicalPlan::TableScan(TableScan { + table_name, + filters: scan_filters, + .. + }) => { + // Only extract filters from the target table scan. + // This prevents incorrect filter extraction in UPDATE...FROM scenarios + // where multiple table scans may have filters. + if table_name.resolved_eq(target) { + for filter in scan_filters { + filters.extend(split_conjunction(filter).into_iter().cloned()); + } + } + } + // Plans without filter information + LogicalPlan::EmptyRelation(_) + | LogicalPlan::Values(_) + | LogicalPlan::DescribeTable(_) + | LogicalPlan::Explain(_) + | LogicalPlan::Analyze(_) + | LogicalPlan::Distinct(_) + | LogicalPlan::Extension(_) + | LogicalPlan::Statement(_) + | LogicalPlan::Dml(_) + | LogicalPlan::Ddl(_) + | LogicalPlan::Copy(_) + | LogicalPlan::Unnest(_) + | LogicalPlan::RecursiveQuery(_) => { + // No filters to extract from leaf/meta plans + } + // Plans with inputs (may contain filters in children) + LogicalPlan::Projection(_) + | LogicalPlan::SubqueryAlias(_) + | LogicalPlan::Limit(_) + | LogicalPlan::Sort(_) + | LogicalPlan::Union(_) + | LogicalPlan::Join(_) + | LogicalPlan::Repartition(_) + | LogicalPlan::Aggregate(_) + | LogicalPlan::Window(_) + | LogicalPlan::Subquery(_) => { + // Filter information may appear in child nodes; continue traversal + // to extract filters from Filter/TableScan nodes deeper in the plan + } + } + Ok(TreeNodeRecursion::Continue) + })?; + + // Strip qualifiers and deduplicate. This ensures: + // 1. Only target-table predicates are retained from Filter nodes + // 2. Qualifiers stripped for TableProvider compatibility + // 3. 
Duplicates removed (from Filter nodes + TableScan.filters) + // + // Deduplication is necessary because filters may appear in both Filter nodes + // and TableScan.filters when the optimizer performs partial (Inexact) pushdown. + let mut seen_filters = HashSet::new(); + filters + .into_iter() + .try_fold(Vec::new(), |mut deduped, filter| { + let unqualified = strip_column_qualifiers(filter).map_err(|e| { + e.context(format!( + "Failed to strip column qualifiers for DML filter on table '{target}'" + )) + })?; + if seen_filters.insert(unqualified.clone()) { + deduped.push(unqualified); + } + Ok(deduped) + }) +} + +/// Determine whether a predicate references only columns from the target table +/// or its aliases. +/// +/// Columns may be qualified with the target table name or any of its aliases. +/// Unqualified columns are also accepted as they implicitly belong to the target table. +fn predicate_is_on_target_multi( + expr: &Expr, + allowed_refs: &[TableReference], +) -> Result { + let mut columns = HashSet::new(); + expr_to_columns(expr, &mut columns)?; + + // Short-circuit on first mismatch: returns false if any column references a table not in allowed_refs. + // Columns are accepted if: + // 1. They are unqualified (no relation specified), OR + // 2. Their relation matches one of the allowed table references using resolved equality + Ok(!columns.iter().any(|column| { + column.relation.as_ref().is_some_and(|relation| { + !allowed_refs + .iter() + .any(|allowed| relation.resolved_eq(allowed)) + }) + })) +} + +/// Strip table qualifiers from column references in an expression. +/// This is needed because DML filter expressions contain qualified column names +/// (e.g., "table.column") but the TableProvider's schema only has simple names. +fn strip_column_qualifiers(expr: Expr) -> Result { + expr.transform(|e| { + if let Expr::Column(col) = &e + && col.relation.is_some() + { + // Strip the qualifier + return Ok(Transformed::yes(Expr::Column(Column::new_unqualified( + col.name.clone(), + )))); + } + Ok(Transformed::no(e)) + }) + .map(|t| t.data) +} + +/// Extract column assignments from an UPDATE input plan. +/// For UPDATE statements, the SQL planner encodes assignments as a projection +/// over the source table. This function extracts column name and expression pairs +/// from the projection. Column qualifiers are stripped from the expressions. 
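+///
+/// For example, for a statement like the following (plan shape abbreviated;
+/// a sketch, not a doctest):
+///
+/// ```rust,ignore
+/// // UPDATE t SET b = b + 1 WHERE a = 5 plans roughly as
+/// //   Projection: t.a AS a, t.b + 1 AS b
+/// //     Filter: t.a = 5
+/// //       TableScan: t
+/// let assignments = extract_update_assignments(&input)?;
+/// // only ("b", b + 1) is returned: the identity assignment for `a` is
+/// // skipped and the `t.` qualifier is stripped
+/// assert_eq!(assignments.len(), 1);
+/// ```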
+///
+fn extract_update_assignments(input: &Arc<LogicalPlan>) -> Result<Vec<(String, Expr)>> {
+    // The UPDATE input plan structure is:
+    //   Projection(updated columns as expressions with aliases)
+    //     Filter(optional WHERE clause)
+    //       TableScan
+    //
+    // Each projected expression has an alias matching the column name
+    let mut assignments = Vec::new();
+
+    // Find the top-level projection
+    if let LogicalPlan::Projection(projection) = input.as_ref() {
+        for expr in &projection.expr {
+            if let Expr::Alias(alias) = expr {
+                // The alias name is the column name being updated
+                // The inner expression is the new value
+                let column_name = alias.name.clone();
+                // Only include if it's not just a column reference to itself
+                // (those are columns that aren't being updated)
+                if !is_identity_assignment(&alias.expr, &column_name) {
+                    // Strip qualifiers from the assignment expression
+                    let stripped_expr = strip_column_qualifiers((*alias.expr).clone())?;
+                    assignments.push((column_name, stripped_expr));
+                }
+            }
+        }
+    } else {
+        // Try to find projection deeper in the plan
+        input.apply(|node| {
+            if let LogicalPlan::Projection(projection) = node {
+                for expr in &projection.expr {
+                    if let Expr::Alias(alias) = expr {
+                        let column_name = alias.name.clone();
+                        if !is_identity_assignment(&alias.expr, &column_name) {
+                            let stripped_expr =
+                                strip_column_qualifiers((*alias.expr).clone())?;
+                            assignments.push((column_name, stripped_expr));
+                        }
+                    }
+                }
+                return Ok(TreeNodeRecursion::Stop);
+            }
+            Ok(TreeNodeRecursion::Continue)
+        })?;
+    }
+
+    Ok(assignments)
+}
+
+/// Check if an assignment is an identity assignment (column = column)
+/// These are columns that are not being modified in the UPDATE
+fn is_identity_assignment(expr: &Expr, column_name: &str) -> bool {
+    match expr {
+        Expr::Column(col) => col.name == column_name,
+        _ => false,
+    }
+}
+
 /// Check if window bounds are valid after schema information is available, and
 /// window_frame bounds are casted to the corresponding column type.
/// queries like: @@ -1858,9 +2433,10 @@ pub fn create_window_expr_with_name( if !is_window_frame_bound_valid(window_frame) { return plan_err!( - "Invalid window frame: start bound ({}) cannot be larger than end bound ({})", - window_frame.start_bound, window_frame.end_bound - ); + "Invalid window frame: start bound ({}) cannot be larger than end bound ({})", + window_frame.start_bound, + window_frame.end_bound + ); } let window_frame = Arc::new(window_frame.clone()); @@ -2228,14 +2804,21 @@ impl DefaultPhysicalPlanner { let schema = Arc::clone(a.schema.inner()); let show_statistics = session_state.config_options().explain.show_statistics; let analyze_level = session_state.config_options().explain.analyze_level; - let metric_types = match analyze_level { - ExplainAnalyzeLevel::Summary => vec![MetricType::SUMMARY], - ExplainAnalyzeLevel::Dev => vec![MetricType::SUMMARY, MetricType::DEV], + let metric_types = analyze_level.included_types(); + let analyze_categories = session_state + .config_options() + .explain + .analyze_categories + .clone(); + let metric_categories = match analyze_categories { + ExplainAnalyzeCategories::All => None, + ExplainAnalyzeCategories::Only(cats) => Some(cats), }; Ok(Arc::new(AnalyzeExec::new( a.verbose, show_statistics, metric_types, + metric_categories, input, schema, ))) @@ -2243,6 +2826,7 @@ impl DefaultPhysicalPlanner { /// Optimize a physical plan by applying each physical optimizer, /// calling observer(plan, optimizer after each one) + #[expect(clippy::needless_pass_by_value)] pub fn optimize_physical_plan( &self, plan: Arc, @@ -2270,14 +2854,14 @@ impl DefaultPhysicalPlanner { for optimizer in optimizers { let before_schema = new_plan.schema(); new_plan = optimizer - .optimize(new_plan, session_state.config_options()) + .optimize_with_context(new_plan, session_state) .map_err(|e| { DataFusionError::Context(optimizer.name().to_string(), Box::new(e)) })?; // This only checks the schema in release build, and performs additional checks in debug mode. OptimizationInvariantChecker::new(optimizer) - .check(&new_plan, before_schema)?; + .check(&new_plan, &before_schema)?; debug!( "Optimized physical plan by {}:\n{}\n", @@ -2310,7 +2894,7 @@ impl DefaultPhysicalPlanner { // return an record_batch which describes a table's schema. fn plan_describe( &self, - table_schema: Arc, + table_schema: &Arc, output_schema: Arc, ) -> Result> { let mut column_names = StringBuilder::new(); @@ -2344,9 +2928,37 @@ impl DefaultPhysicalPlanner { Ok(mem_exec) } - fn create_project_physical_exec( + /// Build physical plans for scalar subqueries and assign each an ordinal + /// `SubqueryIndex`. Returns the links (plan + index) and a map from logical + /// `Subquery` to its index. + async fn plan_scalar_subqueries( &self, + subqueries: Vec, session_state: &SessionState, + ) -> Result<(Vec, DFHashMap)> { + let mut links = Vec::with_capacity(subqueries.len()); + let mut index_map = DFHashMap::with_capacity(subqueries.len()); + for sq in subqueries { + // Callers deduplicate, but guard against accidental double-planning. 
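+            // (If the same `Subquery` does appear twice, the first planned
+            // copy wins and later occurrences keep its `SubqueryIndex`.)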
+ if index_map.contains_key(&sq) { + continue; + } + let physical_plan = self + .create_initial_plan(&sq.subquery, session_state) + .await?; + let index = SubqueryIndex::new(links.len()); + links.push(ScalarSubqueryLink { + plan: physical_plan, + index, + }); + index_map.insert(sq, index); + } + Ok((links, index_map)) + } + + fn create_project_physical_exec_with_props( + &self, + execution_props: &ExecutionProps, input_exec: Arc, input: &Arc, expr: &[Expr], @@ -2385,7 +2997,7 @@ impl DefaultPhysicalPlanner { }; let physical_expr = - self.create_physical_expr(e, input_logical_schema, session_state); + create_physical_expr(e, input_logical_schema, execution_props); tuple_err((physical_expr, physical_name)) }) @@ -2513,11 +3125,14 @@ impl<'a> OptimizationInvariantChecker<'a> { pub fn check( &mut self, plan: &Arc, - previous_schema: Arc, + previous_schema: &Arc, ) -> Result<()> { // if the rule is not permitted to change the schema, confirm that it did not change. - if self.rule.schema_check() && plan.schema() != previous_schema { - internal_err!("PhysicalOptimizer rule '{}' failed. Schema mismatch. Expected original schema: {:?}, got new schema: {:?}", + if self.rule.schema_check() + && !is_allowed_schema_change(previous_schema.as_ref(), plan.schema().as_ref()) + { + internal_err!( + "PhysicalOptimizer rule '{}' failed. Schema mismatch. Expected original schema: {}, got new schema: {}", self.rule.name(), previous_schema, plan.schema() @@ -2532,6 +3147,38 @@ impl<'a> OptimizationInvariantChecker<'a> { } } +/// Checks if the change from `old` schema to `new` is allowed or not. +/// +/// The current implementation only allows nullability of individual fields to change +/// from 'nullable' to 'not nullable'. This can happen due to physical expressions knowing +/// more about their null-ness than their logical counterparts. +/// This change is allowed because for any field the non-nullable domain `F` is a strict subset +/// of the nullable domain `F ∪ { NULL }`. A physical schema that guarantees a stricter subset +/// of values will not violate any assumptions made based on the less strict schema. 
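+///
+/// For example, a field declared `("a", Int32, nullable)` may become
+/// `("a", Int32, non-nullable)` during optimization, while the reverse
+/// relaxation, or any change to field names, data types, or metadata, is
+/// rejected.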
+fn is_allowed_schema_change(old: &Schema, new: &Schema) -> bool { + if new.metadata != old.metadata { + return false; + } + + if new.fields.len() != old.fields.len() { + return false; + } + + let new_fields = new.fields.iter().map(|f| f.as_ref()); + let old_fields = old.fields.iter().map(|f| f.as_ref()); + old_fields + .zip(new_fields) + .all(|(old, new)| is_allowed_field_change(old, new)) +} + +fn is_allowed_field_change(old_field: &Field, new_field: &Field) -> bool { + new_field.name() == old_field.name() + && new_field.data_type() == old_field.data_type() + && new_field.metadata() == old_field.metadata() + && (new_field.is_nullable() == old_field.is_nullable() + || !new_field.is_nullable()) +} + impl<'n> TreeNodeVisitor<'n> for OptimizationInvariantChecker<'_> { type Node = Arc; @@ -2574,17 +3221,16 @@ impl<'n> TreeNodeVisitor<'n> for InvariantChecker { #[cfg(test)] mod tests { - use std::any::Any; use std::cmp::Ordering; use std::fmt::{self, Debug}; use std::ops::{BitAnd, Not}; use super::*; - use crate::datasource::file_format::options::CsvReadOptions; use crate::datasource::MemTable; + use crate::datasource::file_format::options::CsvReadOptions; use crate::physical_plan::{ - expressions, DisplayAs, DisplayFormatType, PlanProperties, - SendableRecordBatchStream, + DisplayAs, DisplayFormatType, PlanProperties, SendableRecordBatchStream, + expressions, }; use crate::prelude::{SessionConfig, SessionContext}; use crate::test_util::{scan_empty, scan_empty_with_partitions}; @@ -2595,12 +3241,14 @@ mod tests { use arrow_schema::SchemaRef; use datafusion_common::config::ConfigOptions; use datafusion_common::{ - assert_contains, DFSchemaRef, TableReference, ToDFSchema as _, + DFSchemaRef, TableReference, ToDFSchema as _, assert_batches_eq, assert_contains, }; - use datafusion_execution::runtime_env::RuntimeEnv; use datafusion_execution::TaskContext; + use datafusion_execution::runtime_env::RuntimeEnv; use datafusion_expr::builder::subquery_alias; - use datafusion_expr::{col, lit, LogicalPlanBuilder, UserDefinedLogicalNodeCore}; + use datafusion_expr::{ + LogicalPlanBuilder, TableSource, UserDefinedLogicalNodeCore, col, lit, + }; use datafusion_functions_aggregate::count::count_all; use datafusion_functions_aggregate::expr_fn::sum; use datafusion_physical_expr::EquivalenceProperties; @@ -2627,6 +3275,16 @@ mod tests { .await } + async fn plan_sql(query: &str) -> Result> { + let ctx = SessionContext::new(); + ctx.sql(query).await?.create_physical_plan().await + } + + async fn collect_sql(query: &str) -> Result> { + let ctx = SessionContext::new(); + ctx.sql(query).await?.collect().await + } + #[tokio::test] async fn test_all_operators() -> Result<()> { let logical_plan = test_csv_scan() @@ -2650,6 +3308,132 @@ mod tests { Ok(()) } + #[tokio::test] + async fn scalar_subquery_in_sort_expr_plans() -> Result<()> { + let plan = plan_sql( + "SELECT x \ + FROM (VALUES (2), (1)) AS t(x) \ + ORDER BY x + (SELECT max(y) FROM (VALUES (10), (20)) AS u(y))", + ) + .await?; + + assert_contains!(format!("{plan:?}"), "ScalarSubqueryExec"); + Ok(()) + } + + #[tokio::test] + async fn scalar_subquery_in_sort_expr_executes() -> Result<()> { + let batches = collect_sql( + "SELECT x \ + FROM (VALUES (2), (1), (3)) AS t(x) \ + ORDER BY x + (SELECT max(y) FROM (VALUES (10), (20)) AS u(y)) DESC", + ) + .await?; + + assert_batches_eq!( + &[ + "+---+", "| x |", "+---+", "| 3 |", "| 2 |", "| 1 |", "+---+", + ], + &batches + ); + Ok(()) + } + + #[tokio::test] + async fn scalar_subquery_in_aggregate_arg_plans() -> 
Result<()> { + let plan = plan_sql( + "SELECT sum(x + (SELECT max(y) FROM (VALUES (10), (20)) AS u(y))) \ + FROM (VALUES (2), (1)) AS t(x)", + ) + .await?; + + assert_contains!(format!("{plan:?}"), "ScalarSubqueryExec"); + Ok(()) + } + + #[tokio::test] + async fn scalar_subquery_in_aggregate_arg_executes() -> Result<()> { + let batches = collect_sql( + "SELECT sum(x + (SELECT max(y) FROM (VALUES (10), (20)) AS u(y))) AS s \ + FROM (VALUES (2), (1)) AS t(x)", + ) + .await?; + + assert_batches_eq!( + &["+----+", "| s |", "+----+", "| 43 |", "+----+",], + &batches + ); + Ok(()) + } + + #[tokio::test] + async fn scalar_subquery_in_join_on_plans() -> Result<()> { + let plan = plan_sql( + "SELECT l.x, r.y \ + FROM (VALUES (1), (2)) AS l(x) \ + JOIN (VALUES (11), (12)) AS r(y) \ + ON l.x + (SELECT 10) = r.y", + ) + .await?; + + let formatted = format!("{plan:?}"); + assert_contains!(&formatted, "ScalarSubqueryExec"); + assert!( + formatted.contains("HashJoinExec") + || formatted.contains("SortMergeJoinExec") + || formatted.contains("NestedLoopJoinExec") + ); + Ok(()) + } + + #[tokio::test] + async fn scalar_subquery_mixed_correlated_and_uncorrelated_executes() -> Result<()> { + let query = "SELECT t.x, \ + (SELECT max(y) FROM (VALUES (10), (20)) AS u(y)) + \ + (SELECT count(*) FROM (VALUES (1), (1), (2)) AS v(z) WHERE v.z = t.x) AS total \ + FROM (VALUES (1), (2), (3)) AS t(x) \ + ORDER BY x"; + let plan = plan_sql(query).await?; + + let formatted = format!("{plan:?}"); + assert_eq!(formatted.matches("ScalarSubqueryExec").count(), 1); + assert!( + formatted.contains("HashJoinExec") + || formatted.contains("SortMergeJoinExec") + || formatted.contains("NestedLoopJoinExec") + ); + + let batches = collect_sql(query).await?; + assert_batches_eq!( + &[ + "+---+-------+", + "| x | total |", + "+---+-------+", + "| 1 | 22 |", + "| 2 | 21 |", + "| 3 | 20 |", + "+---+-------+", + ], + &batches + ); + Ok(()) + } + + #[tokio::test] + async fn scalar_subquery_in_projection_and_filter_plans() -> Result<()> { + let plan = plan_sql( + "SELECT x + (SELECT max(y) FROM (VALUES (10), (20)) AS u(y)) \ + FROM (VALUES (2), (1)) AS t(x) \ + WHERE x > (SELECT min(y) FROM (VALUES (0), (1)) AS v(y))", + ) + .await?; + + let formatted = format!("{plan:?}"); + // All uncorrelated scalar subqueries are hoisted to a single root node. 
+ assert_eq!(formatted.matches("ScalarSubqueryExec").count(), 1); + Ok(()) + } + #[tokio::test] async fn test_create_cube_expr() -> Result<()> { let logical_plan = test_csv_scan().await?.build()?; @@ -2667,7 +3451,7 @@ mod tests { &exprs, logical_input_schema, physical_input_schema, - &session_state, + session_state.execution_props(), ); insta::assert_debug_snapshot!(cube, @r#" @@ -2773,6 +3557,7 @@ mod tests { true, ], ], + has_grouping_set: true, }, ) "#); @@ -2797,7 +3582,7 @@ mod tests { &exprs, logical_input_schema, physical_input_schema, - &session_state, + session_state.execution_props(), ); insta::assert_debug_snapshot!(rollup, @r#" @@ -2883,6 +3668,7 @@ mod tests { false, ], ], + has_grouping_set: true, }, ) "#); @@ -3000,8 +3786,7 @@ mod tests { .create_physical_plan(&logical_plan, &session_state) .await; - let expected_error = - "No installed planner was able to convert the custom node to an execution plan: NoOp"; + let expected_error = "No installed planner was able to convert the custom node to an execution plan: NoOp"; match plan { Ok(_) => panic!("Expected planning failure"), Err(e) => assert!( @@ -3033,21 +3818,17 @@ mod tests { } #[tokio::test] - async fn in_list_types() -> Result<()> { - // expression: "a in ('a', 1)" + async fn in_list_types_mixed_string_int_error() -> Result<()> { + // expression: "c1 in ('a', 1)" where c1 is Utf8 let list = vec![lit("a"), lit(1i64)]; let logical_plan = test_csv_scan() .await? - // filter clause needs the type coercion rule applied .filter(col("c12").lt(lit(0.05)))? .project(vec![col("c1").in_list(list, false)])? .build()?; - let execution_plan = plan(&logical_plan).await?; - // verify that the plan correctly adds cast from Int64(1) to Utf8, and the const will be evaluated. - - let expected = r#"expr: BinaryExpr { left: BinaryExpr { left: Column { name: "c1", index: 0 }, op: Eq, right: Literal { value: Utf8("a"), field: Field { name: "lit", data_type: Utf8 } }, fail_on_overflow: false }"#; + let e = plan(&logical_plan).await.unwrap_err().to_string(); - assert_contains!(format!("{execution_plan:?}"), expected); + assert_contains!(&e, "Cannot cast string 'a' to value of Int64 type"); Ok(()) } @@ -3067,7 +3848,7 @@ mod tests { assert_contains!( &e, - r#"Error during planning: Can not find compatible types to compare Boolean with [Struct("foo": Boolean), Utf8]"# + r#"Error during planning: Can not find compatible types to compare Boolean with [Struct("foo": non-null Boolean), Utf8]"# ); Ok(()) @@ -3092,7 +3873,6 @@ mod tests { let execution_plan = plan(&logical_plan).await?; let final_hash_agg = execution_plan - .as_any() .downcast_ref::() .expect("hash aggregate"); assert_eq!( @@ -3120,7 +3900,6 @@ mod tests { let execution_plan = plan(&logical_plan).await?; let final_hash_agg = execution_plan - .as_any() .downcast_ref::() .expect("hash aggregate"); assert_eq!( @@ -3255,21 +4034,30 @@ mod tests { .unwrap(); let plan = plan(&logical_plan).await.unwrap(); - if let Some(plan) = plan.as_any().downcast_ref::() { + if let Some(plan) = plan.downcast_ref::() { let stringified_plans = plan.stringified_plans(); assert!(stringified_plans.len() >= 4); - assert!(stringified_plans - .iter() - .any(|p| matches!(p.plan_type, PlanType::FinalLogicalPlan))); - assert!(stringified_plans - .iter() - .any(|p| matches!(p.plan_type, PlanType::InitialPhysicalPlan))); - assert!(stringified_plans - .iter() - .any(|p| matches!(p.plan_type, PlanType::OptimizedPhysicalPlan { .. 
}))); - assert!(stringified_plans - .iter() - .any(|p| matches!(p.plan_type, PlanType::FinalPhysicalPlan))); + assert!( + stringified_plans + .iter() + .any(|p| p.plan_type == PlanType::FinalLogicalPlan) + ); + assert!( + stringified_plans + .iter() + .any(|p| p.plan_type == PlanType::InitialPhysicalPlan) + ); + assert!( + stringified_plans.iter().any(|p| matches!( + p.plan_type, + PlanType::OptimizedPhysicalPlan { .. } + )) + ); + assert!( + stringified_plans + .iter() + .any(|p| p.plan_type == PlanType::FinalPhysicalPlan) + ); } else { panic!( "Plan was not an explain plan: {}", @@ -3314,7 +4102,7 @@ mod tests { .handle_explain(&explain, &ctx.state()) .await .unwrap(); - if let Some(plan) = plan.as_any().downcast_ref::() { + if let Some(plan) = plan.downcast_ref::() { let stringified_plans = plan.stringified_plans(); assert_eq!(stringified_plans.len(), 1); assert_eq!(stringified_plans[0].plan.as_str(), "Test Err"); @@ -3412,13 +4200,15 @@ mod tests { #[derive(Debug)] struct NoOpExecutionPlan { - cache: PlanProperties, + cache: Arc, } impl NoOpExecutionPlan { fn new(schema: SchemaRef) -> Self { let cache = Self::compute_properties(schema); - Self { cache } + Self { + cache: Arc::new(cache), + } } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. @@ -3452,11 +4242,7 @@ mod tests { } /// Return a reference to Any that can be used for downcasting - fn as_any(&self) -> &dyn Any { - self - } - - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.cache } @@ -3478,6 +4264,20 @@ mod tests { ) -> Result { unimplemented!("NoOpExecutionPlan::execute"); } + + fn apply_expressions( + &self, + f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result, + ) -> Result { + // Visit expressions in the output ordering from equivalence properties + let mut tnr = TreeNodeRecursion::Continue; + if let Some(ordering) = self.cache.output_ordering() { + for sort_expr in ordering { + tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?; + } + } + Ok(tnr) + } } // Produces an execution plan where the schema is mismatched from @@ -3604,13 +4404,10 @@ digraph { fn schema(&self) -> SchemaRef { Arc::new(Schema::empty()) } - fn as_any(&self) -> &dyn Any { - unimplemented!() - } fn children(&self) -> Vec<&Arc> { self.0.iter().collect::>() } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { unimplemented!() } fn execute( @@ -3620,6 +4417,12 @@ digraph { ) -> Result { unimplemented!() } + fn apply_expressions( + &self, + _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result, + ) -> Result { + Ok(TreeNodeRecursion::Continue) + } } impl DisplayAs for OkExtensionNode { fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { @@ -3636,8 +4439,12 @@ digraph { } fn check_invariants(&self, check: InvariantLevel) -> Result<()> { match check { - InvariantLevel::Always => plan_err!("extension node failed it's user-defined always-invariant check"), - InvariantLevel::Executable => panic!("the OptimizationInvariantChecker should not be checking for executableness"), + InvariantLevel::Always => plan_err!( + "extension node failed it's user-defined always-invariant check" + ), + InvariantLevel::Executable => panic!( + "the OptimizationInvariantChecker should not be checking for executableness" + ), } } fn schema(&self) -> SchemaRef { @@ -3649,13 +4456,10 @@ digraph { ) -> Result> { unimplemented!() } - fn as_any(&self) -> &dyn Any { - unimplemented!() - } fn children(&self) -> 
Vec<&Arc<dyn ExecutionPlan>> { unimplemented!() } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc<PlanProperties> { unimplemented!() } fn execute( @@ -3665,6 +4469,12 @@ digraph { ) -> Result<SendableRecordBatchStream> { unimplemented!() } + fn apply_expressions( + &self, + _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>, + ) -> Result<TreeNodeRecursion> { + Ok(TreeNodeRecursion::Continue) + } } impl DisplayAs for InvariantFailsExtensionNode { fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { @@ -3706,24 +4516,26 @@ digraph { // Test: check should pass with same schema let equal_schema = ok_plan.schema(); - OptimizationInvariantChecker::new(&rule).check(&ok_plan, equal_schema)?; + OptimizationInvariantChecker::new(&rule).check(&ok_plan, &equal_schema)?; // Test: should fail with schema changed let different_schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Boolean, false)])); let expected_err = OptimizationInvariantChecker::new(&rule) - .check(&ok_plan, different_schema) + .check(&ok_plan, &different_schema) .unwrap_err(); assert!(expected_err.to_string().contains("PhysicalOptimizer rule 'OptimizerRuleWithSchemaCheck' failed. Schema mismatch. Expected original schema")); // Test: should fail when extension node fails its own invariant check let failing_node: Arc<dyn ExecutionPlan> = Arc::new(InvariantFailsExtensionNode); let expected_err = OptimizationInvariantChecker::new(&rule) - .check(&failing_node, ok_plan.schema()) + .check(&failing_node, &ok_plan.schema()) .unwrap_err(); - assert!(expected_err - .to_string() - .contains("extension node failed it's user-defined always-invariant check")); + assert!( + expected_err.to_string().contains( + "extension node failed its user-defined always-invariant check" + ) + ); // Test: should fail when descendant extension node fails let failing_node: Arc<dyn ExecutionPlan> = Arc::new(InvariantFailsExtensionNode); @@ -3732,11 +4544,13 @@ digraph { Arc::clone(&child), ])?; let expected_err = OptimizationInvariantChecker::new(&rule) - .check(&invalid_plan, ok_plan.schema()) + .check(&invalid_plan, &ok_plan.schema()) .unwrap_err(); - assert!(expected_err - .to_string() - .contains("extension node failed it's user-defined always-invariant check")); + assert!( + expected_err.to_string().contains( + "extension node failed its user-defined always-invariant check" + ) + ); Ok(()) } @@ -3766,13 +4580,10 @@ digraph { ) -> Result<Arc<dyn ExecutionPlan>> { unimplemented!() } - fn as_any(&self) -> &dyn Any { - unimplemented!() - } fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> { vec![] } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc<PlanProperties> { unimplemented!() } fn execute( @@ -3782,6 +4593,12 @@ digraph { ) -> Result<SendableRecordBatchStream> { unimplemented!() } + fn apply_expressions( + &self, + _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>, + ) -> Result<TreeNodeRecursion> { + Ok(TreeNodeRecursion::Continue) + } } impl DisplayAs for ExecutableInvariantFails { fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { @@ -3857,8 +4674,8 @@ digraph { let right = LogicalPlanBuilder::scan("right", source, None)?.build()?; let join_keys = ( - vec![datafusion_common::Column::new(Some("left"), "a")], - vec![datafusion_common::Column::new(Some("right"), "a")], + vec![Column::new(Some("left"), "a")], + vec![Column::new(Some("right"), "a")], ); let join = left.join(right, JoinType::Full, join_keys, None)?.build()?; @@ -3879,4 +4696,293 @@ digraph { Ok(()) } + + // --- Tests for aggregate schema mismatch error messages --- + + use crate::catalog::TableProvider; + use datafusion_catalog::Session; + use datafusion_expr::TableType; + + /// A TableProvider 
that returns different schemas for logical planning vs. physical planning. + /// Used to test schema mismatch error messages. + #[derive(Debug)] + struct MockSchemaTableProvider { + logical_schema: SchemaRef, + physical_schema: SchemaRef, + } + + #[async_trait] + impl TableProvider for MockSchemaTableProvider { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.logical_schema) + } + + fn table_type(&self) -> TableType { + TableType::Base + } + + async fn scan( + &self, + _state: &dyn Session, + _projection: Option<&Vec<usize>>, + _filters: &[Expr], + _limit: Option<usize>, + ) -> Result<Arc<dyn ExecutionPlan>> { + Ok(Arc::new(NoOpExecutionPlan::new(Arc::clone( + &self.physical_schema, + )))) + } + } + + /// Attempts to plan a query with potentially mismatched schemas. + async fn plan_with_schemas( + logical_schema: SchemaRef, + physical_schema: SchemaRef, + query: &str, + ) -> Result<Arc<dyn ExecutionPlan>> { + let provider = MockSchemaTableProvider { + logical_schema, + physical_schema, + }; + let ctx = SessionContext::new(); + ctx.register_table("test", Arc::new(provider)).unwrap(); + + ctx.sql(query).await.unwrap().create_physical_plan().await + } + + #[tokio::test] + // When schemas match, planning proceeds past the schema_satisfied_by check. + // It then panics with the `unimplemented!` error from NoOpExecutionPlan. + #[should_panic(expected = "NoOpExecutionPlan")] + async fn test_aggregate_schema_check_passes() { + let schema = + Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)])); + + plan_with_schemas( + Arc::clone(&schema), + schema, + "SELECT count(*) FROM test GROUP BY c1", + ) + .await + .unwrap(); + } + + #[tokio::test] + async fn test_aggregate_schema_mismatch_metadata() { + let logical_schema = + Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)])); + let physical_schema = Arc::new( + Schema::new(vec![Field::new("c1", DataType::Int32, false)]) + .with_metadata(HashMap::from([("key".into(), "value".into())])), + ); + + let err = plan_with_schemas( + logical_schema, + physical_schema, + "SELECT count(*) FROM test GROUP BY c1", + ) + .await + .unwrap_err(); + + assert_contains!(err.to_string(), "schema metadata differs"); + } + + #[tokio::test] + async fn test_aggregate_schema_mismatch_field_count() { + let logical_schema = + Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)])); + let physical_schema = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Int32, false), + Field::new("c2", DataType::Int32, false), + ])); + + let err = plan_with_schemas( + logical_schema, + physical_schema, + "SELECT count(*) FROM test GROUP BY c1", + ) + .await + .unwrap_err(); + + assert_contains!(err.to_string(), "Different number of fields"); + } + + #[tokio::test] + async fn test_aggregate_schema_mismatch_field_name() { + let logical_schema = + Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)])); + let physical_schema = Arc::new(Schema::new(vec![Field::new( + "different_name", + DataType::Int32, + false, + )])); + + let err = plan_with_schemas( + logical_schema, + physical_schema, + "SELECT count(*) FROM test GROUP BY c1", + ) + .await + .unwrap_err(); + + assert_contains!(err.to_string(), "field name at index"); + } + + #[tokio::test] + async fn test_aggregate_schema_mismatch_field_type() { + let logical_schema = + Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)])); + let physical_schema = + Arc::new(Schema::new(vec![Field::new("c1", DataType::Int64, false)])); + + let err = plan_with_schemas( + logical_schema, + physical_schema, + "SELECT count(*) FROM test GROUP BY c1", + ) + 
.await + .unwrap_err(); + + assert_contains!(err.to_string(), "field data type at index"); + } + + #[tokio::test] + async fn test_aggregate_schema_mismatch_field_nullability() { + let logical_schema = + Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)])); + let physical_schema = + Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, true)])); + + let err = plan_with_schemas( + logical_schema, + physical_schema, + "SELECT count(*) FROM test GROUP BY c1", + ) + .await + .unwrap_err(); + + assert_contains!(err.to_string(), "field nullability at index"); + } + + #[tokio::test] + async fn test_aggregate_schema_mismatch_field_metadata() { + let logical_schema = + Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)])); + let physical_schema = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Int32, false) + .with_metadata(HashMap::from([("key".into(), "value".into())])), + ])); + + let err = plan_with_schemas( + logical_schema, + physical_schema, + "SELECT count(*) FROM test GROUP BY c1", + ) + .await + .unwrap_err(); + + assert_contains!(err.to_string(), "field metadata at index"); + } + + #[tokio::test] + async fn test_aggregate_schema_mismatch_multiple() { + let logical_schema = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Int32, false), + Field::new("c2", DataType::Utf8, false), + ])); + let physical_schema = Arc::new( + Schema::new(vec![ + Field::new("c1", DataType::Int64, true) + .with_metadata(HashMap::from([("key".into(), "value".into())])), + Field::new("c2", DataType::Utf8, false), + ]) + .with_metadata(HashMap::from([( + "schema_key".into(), + "schema_value".into(), + )])), + ); + + let err = plan_with_schemas( + logical_schema, + physical_schema, + "SELECT count(*) FROM test GROUP BY c1", + ) + .await + .unwrap_err(); + + // Verify all applicable error fragments are present + let err_str = err.to_string(); + assert_contains!(&err_str, "schema metadata differs"); + assert_contains!(&err_str, "field data type at index"); + assert_contains!(&err_str, "field nullability at index"); + assert_contains!(&err_str, "field metadata at index"); + } + + #[derive(Debug)] + struct MockTableSource { + schema: SchemaRef, + } + + impl TableSource for MockTableSource { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + } + + struct MockTableScanExtensionPlanner; + + #[async_trait] + impl ExtensionPlanner for MockTableScanExtensionPlanner { + async fn plan_extension( + &self, + _planner: &dyn PhysicalPlanner, + _node: &dyn UserDefinedLogicalNode, + _logical_inputs: &[&LogicalPlan], + _physical_inputs: &[Arc], + _session_state: &SessionState, + ) -> Result>> { + Ok(None) + } + + async fn plan_table_scan( + &self, + _planner: &dyn PhysicalPlanner, + scan: &TableScan, + _session_state: &SessionState, + ) -> Result>> { + if scan.source.is::() { + Ok(Some(Arc::new(EmptyExec::new(Arc::clone( + scan.projected_schema.inner(), + ))))) + } else { + Ok(None) + } + } + } + + #[tokio::test] + async fn test_table_scan_extension_planner() { + let session_state = make_session_state(); + let planner = Arc::new(MockTableScanExtensionPlanner); + let physical_planner = + DefaultPhysicalPlanner::with_extension_planners(vec![planner]); + + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); + + let table_source = Arc::new(MockTableSource { + schema: Arc::clone(&schema), + }); + let logical_plan = LogicalPlanBuilder::scan("test", table_source, None) + .unwrap() + .build() + .unwrap(); + + let plan = physical_planner + 
.create_physical_plan(&logical_plan, &session_state) + .await + .unwrap(); + + assert_eq!(plan.schema(), schema); + assert!(plan.is::()); + } } diff --git a/datafusion/core/src/prelude.rs b/datafusion/core/src/prelude.rs index d723620d32323..31d9d7eb471f0 100644 --- a/datafusion/core/src/prelude.rs +++ b/datafusion/core/src/prelude.rs @@ -29,15 +29,15 @@ pub use crate::dataframe; pub use crate::dataframe::DataFrame; pub use crate::execution::context::{SQLOptions, SessionConfig, SessionContext}; pub use crate::execution::options::{ - AvroReadOptions, CsvReadOptions, NdJsonReadOptions, ParquetReadOptions, + AvroReadOptions, CsvReadOptions, JsonReadOptions, ParquetReadOptions, }; pub use datafusion_common::Column; pub use datafusion_expr::{ + Expr, expr_fn::*, lit, lit_timestamp_nano, logical_plan::{JoinType, Partitioning}, - Expr, }; pub use datafusion_functions::expr_fn::*; #[cfg(feature = "nested_expressions")] diff --git a/datafusion/core/src/test/mod.rs b/datafusion/core/src/test/mod.rs index 68f83e7f1f115..717182f1d3d5b 100644 --- a/datafusion/core/src/test/mod.rs +++ b/datafusion/core/src/test/mod.rs @@ -25,9 +25,9 @@ use std::io::{BufReader, BufWriter}; use std::path::Path; use std::sync::Arc; +use crate::datasource::file_format::FileFormat; use crate::datasource::file_format::csv::CsvFormat; use crate::datasource::file_format::file_compression_type::FileCompressionType; -use crate::datasource::file_format::FileFormat; use crate::datasource::physical_plan::CsvSource; use crate::datasource::{MemTable, TableProvider}; @@ -35,28 +35,31 @@ use crate::error::Result; use crate::logical_expr::LogicalPlan; use crate::test_util::{aggr_test_schema, arrow_test_data}; +use datafusion_common::config::CsvOptions; + use arrow::array::{self, Array, ArrayRef, Decimal128Builder, Int32Array}; use arrow::datatypes::{DataType, Field, Schema}; use arrow::record_batch::RecordBatch; #[cfg(feature = "compression")] use datafusion_common::DataFusionError; +use datafusion_datasource::TableSchema; use datafusion_datasource::source::DataSourceExec; -#[cfg(feature = "compression")] -use bzip2::write::BzEncoder; #[cfg(feature = "compression")] use bzip2::Compression as BzCompression; +#[cfg(feature = "compression")] +use bzip2::write::BzEncoder; use datafusion_datasource::file_groups::FileGroup; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource_csv::partitioned_csv_config; #[cfg(feature = "compression")] +use flate2::Compression as GzCompression; +#[cfg(feature = "compression")] use flate2::write::GzEncoder; #[cfg(feature = "compression")] -use flate2::Compression as GzCompression; +use liblzma::write::XzEncoder; use object_store::local_unpartitioned_file; #[cfg(feature = "compression")] -use xz2::write::XzEncoder; -#[cfg(feature = "compression")] use zstd::Encoder as ZstdEncoder; pub fn create_table_dual() -> Arc { @@ -84,17 +87,26 @@ pub fn scan_partitioned_csv( let schema = aggr_test_schema(); let filename = "aggregate_test_100.csv"; let path = format!("{}/csv", arrow_test_data()); + let csv_format: Arc = Arc::new(CsvFormat::default()); + let file_groups = partitioned_file_groups( path.as_str(), filename, partitions, - Arc::new(CsvFormat::default()), + &csv_format, FileCompressionType::UNCOMPRESSED, work_dir, )?; - let source = Arc::new(CsvSource::new(true, b'"', b'"')); + let options = CsvOptions { + has_header: Some(true), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let table_schema = TableSchema::from_file_schema(schema); + let source = 
Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); let config = - FileScanConfigBuilder::from(partitioned_csv_config(schema, file_groups, source)) + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?) .with_file_compression_type(FileCompressionType::UNCOMPRESSED) .build(); Ok(DataSourceExec::from_data_source(config)) @@ -105,7 +117,7 @@ pub fn partitioned_file_groups( path: &str, filename: &str, partitions: usize, - file_format: Arc, + file_format: &Arc, file_compression_type: FileCompressionType, work_dir: &Path, ) -> Result> { @@ -189,7 +201,7 @@ pub fn partitioned_file_groups( .collect::>()) } -pub fn assert_fields_eq(plan: &LogicalPlan, expected: Vec<&str>) { +pub fn assert_fields_eq(plan: &LogicalPlan, expected: &[&str]) { let actual: Vec = plan .schema() .fields() diff --git a/datafusion/core/src/test/object_store.rs b/datafusion/core/src/test/object_store.rs index d31c2719973ec..62c6699f8fcd1 100644 --- a/datafusion/core/src/test/object_store.rs +++ b/datafusion/core/src/test/object_store.rs @@ -20,20 +20,21 @@ use crate::{ execution::{context::SessionState, session_state::SessionStateBuilder}, object_store::{ - memory::InMemory, path::Path, Error, GetOptions, GetResult, ListResult, - MultipartUpload, ObjectMeta, ObjectStore, PutMultipartOptions, PutOptions, - PutPayload, PutResult, + Error, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, + ObjectStore, PutMultipartOptions, PutOptions, PutPayload, PutResult, + memory::InMemory, path::Path, }, prelude::SessionContext, }; -use futures::{stream::BoxStream, FutureExt}; +use futures::{FutureExt, stream::BoxStream}; +use object_store::{CopyOptions, ObjectStoreExt}; use std::{ fmt::{Debug, Display, Formatter}, sync::Arc, }; use tokio::{ sync::Barrier, - time::{timeout, Duration}, + time::{Duration, timeout}, }; use url::Url; @@ -130,39 +131,40 @@ impl ObjectStore for BlockingObjectStore { location: &Path, options: GetOptions, ) -> object_store::Result { - self.inner.get_opts(location, options).await - } - - async fn head(&self, location: &Path) -> object_store::Result { - println!( - "{} received head call for {location}", - BlockingObjectStore::NAME - ); - // Wait until the expected number of concurrent calls is reached, but timeout after 1 second to avoid hanging failing tests. - let wait_result = timeout(Duration::from_secs(1), self.barrier.wait()).await; - match wait_result { - Ok(_) => println!( - "{} barrier reached for {location}", + if options.head { + println!( + "{} received head call for {location}", BlockingObjectStore::NAME - ), - Err(_) => { - let error_message = format!( - "{} barrier wait timed out for {location}", + ); + // Wait until the expected number of concurrent calls is reached, but timeout after 1 second to avoid hanging failing tests. + let wait_result = timeout(Duration::from_secs(1), self.barrier.wait()).await; + match wait_result { + Ok(_) => println!( + "{} barrier reached for {location}", BlockingObjectStore::NAME - ); - log::error!("{error_message}"); - return Err(Error::Generic { - store: BlockingObjectStore::NAME, - source: error_message.into(), - }); + ), + Err(_) => { + let error_message = format!( + "{} barrier wait timed out for {location}", + BlockingObjectStore::NAME + ); + log::error!("{error_message}"); + return Err(Error::Generic { + store: BlockingObjectStore::NAME, + source: error_message.into(), + }); + } } } + // Forward the call to the inner object store. 
- self.inner.head(location).await + self.inner.get_opts(location, options).await } - - async fn delete(&self, location: &Path) -> object_store::Result<()> { - self.inner.delete(location).await + fn delete_stream( + &self, + locations: BoxStream<'static, object_store::Result>, + ) -> BoxStream<'static, object_store::Result> { + self.inner.delete_stream(locations) } fn list( @@ -179,15 +181,12 @@ impl ObjectStore for BlockingObjectStore { self.inner.list_with_delimiter(prefix).await } - async fn copy(&self, from: &Path, to: &Path) -> object_store::Result<()> { - self.inner.copy(from, to).await - } - - async fn copy_if_not_exists( + async fn copy_opts( &self, from: &Path, to: &Path, + options: CopyOptions, ) -> object_store::Result<()> { - self.inner.copy_if_not_exists(from, to).await + self.inner.copy_opts(from, to, options).await } } diff --git a/datafusion/core/src/test_util/mod.rs b/datafusion/core/src/test_util/mod.rs index 7149c5b0bd8ca..aad659eacbe55 100644 --- a/datafusion/core/src/test_util/mod.rs +++ b/datafusion/core/src/test_util/mod.rs @@ -23,8 +23,8 @@ pub mod parquet; pub mod csv; use futures::Stream; -use std::any::Any; use std::collections::HashMap; +use std::fmt::Formatter; use std::fs::File; use std::io::Write; use std::path::Path; @@ -36,16 +36,20 @@ use crate::dataframe::DataFrame; use crate::datasource::stream::{FileStreamProvider, StreamConfig, StreamTable}; use crate::datasource::{empty::EmptyTable, provider_as_source}; use crate::error::Result; +use crate::execution::session_state::CacheFactory; use crate::logical_expr::{LogicalPlanBuilder, UNNAMED_TABLE}; use crate::physical_plan::ExecutionPlan; use crate::prelude::{CsvReadOptions, SessionContext}; -use crate::execution::SendableRecordBatchStream; +use crate::execution::{SendableRecordBatchStream, SessionState, SessionStateBuilder}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use datafusion_catalog::Session; -use datafusion_common::TableReference; -use datafusion_expr::{CreateExternalTable, Expr, SortExpr, TableType}; +use datafusion_common::{DFSchemaRef, TableReference}; +use datafusion_expr::{ + CreateExternalTable, Expr, LogicalPlan, SortExpr, TableType, + UserDefinedLogicalNodeCore, +}; use std::pin::Pin; use async_trait::async_trait; @@ -203,10 +207,6 @@ impl TestTableProvider {} #[async_trait] impl TableProvider for TestTableProvider { - fn as_any(&self) -> &dyn Any { - self - } - fn schema(&self) -> SchemaRef { Arc::clone(&self.schema) } @@ -282,3 +282,67 @@ impl RecordBatchStream for BoundedStream { self.record_batch.schema() } } + +#[derive(Hash, Eq, PartialEq, PartialOrd, Debug)] +struct CacheNode { + input: LogicalPlan, +} + +impl UserDefinedLogicalNodeCore for CacheNode { + fn name(&self) -> &str { + "CacheNode" + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.input] + } + + fn schema(&self) -> &DFSchemaRef { + self.input.schema() + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!(f, "CacheNode") + } + + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + inputs: Vec, + ) -> Result { + assert_eq!(inputs.len(), 1, "input size inconsistent"); + Ok(Self { + input: inputs[0].clone(), + }) + } +} + +#[derive(Debug)] +struct TestCacheFactory {} + +impl CacheFactory for TestCacheFactory { + fn create( + &self, + plan: LogicalPlan, + _session_state: &SessionState, + ) -> Result { + Ok(LogicalPlan::Extension(datafusion_expr::Extension { + node: Arc::new(CacheNode { 
input: plan }), + })) + } +} + +/// Create a test table registered to a session context with an associated cache factory +pub async fn test_table_with_cache_factory() -> Result { + let session_state = SessionStateBuilder::new() + .with_cache_factory(Some(Arc::new(TestCacheFactory {}))) + .build(); + let ctx = SessionContext::new_with_state(session_state); + let name = "aggregate_test_100"; + register_aggregate_csv(&ctx, name).await?; + ctx.table(name).await +} diff --git a/datafusion/core/src/test_util/parquet.rs b/datafusion/core/src/test_util/parquet.rs index 203d9e97d2a8c..c53495421307b 100644 --- a/datafusion/core/src/test_util/parquet.rs +++ b/datafusion/core/src/test_util/parquet.rs @@ -32,17 +32,15 @@ use crate::logical_expr::execution_props::ExecutionProps; use crate::logical_expr::simplify::SimplifyContext; use crate::optimizer::simplify_expressions::ExprSimplifier; use crate::physical_expr::create_physical_expr; +use crate::physical_plan::ExecutionPlan; use crate::physical_plan::filter::FilterExec; use crate::physical_plan::metrics::MetricsSet; -use crate::physical_plan::ExecutionPlan; use crate::prelude::{Expr, SessionConfig, SessionContext}; -use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::source::DataSourceExec; -use datafusion_datasource::TableSchema; -use object_store::path::Path; use object_store::ObjectMeta; +use object_store::path::Path; use parquet::arrow::ArrowWriter; use parquet::file::properties::WriterProperties; @@ -157,26 +155,20 @@ impl TestParquetFile { maybe_filter: Option, ) -> Result> { let parquet_options = ctx.copied_table_options().parquet; - let source = Arc::new(ParquetSource::new(parquet_options.clone())); - let scan_config_builder = FileScanConfigBuilder::new( - self.object_store_url.clone(), - Arc::clone(&self.schema), - source, - ) - .with_file(PartitionedFile { - object_meta: self.object_meta.clone(), - partition_values: vec![], - range: None, - statistics: None, - extensions: None, - metadata_size_hint: None, - }); + let source = Arc::new( + ParquetSource::new(Arc::clone(&self.schema)) + .with_table_parquet_options(parquet_options.clone()), + ); + let scan_config_builder = + FileScanConfigBuilder::new(self.object_store_url.clone(), source) + .with_file(PartitionedFile::new_from_meta(self.object_meta.clone())); let df_schema = Arc::clone(&self.schema).to_dfschema_ref()?; // run coercion on the filters to coerce types etc. 
- let props = ExecutionProps::new(); - let context = SimplifyContext::new(&props).with_schema(Arc::clone(&df_schema)); + let context = SimplifyContext::builder() + .with_schema(Arc::clone(&df_schema)) + .build(); if let Some(filter) = maybe_filter { let simplifier = ExprSimplifier::new(context); let filter = simplifier.coerce(filter, &df_schema).unwrap(); @@ -184,10 +176,10 @@ impl TestParquetFile { create_physical_expr(&filter, &df_schema, &ExecutionProps::default())?; let source = Arc::new( - ParquetSource::new(parquet_options) + ParquetSource::new(Arc::clone(&self.schema)) + .with_table_parquet_options(parquet_options) .with_predicate(Arc::clone(&physical_filter_expr)), - ) - .with_schema(TableSchema::from_file_schema(Arc::clone(&self.schema))); + ); let config = scan_config_builder.with_source(source).build(); let parquet_exec = DataSourceExec::from_data_source(config); @@ -204,13 +196,12 @@ impl TestParquetFile { /// Recursively searches for DataSourceExec and returns the metrics /// on the first one it finds pub fn parquet_metrics(plan: &Arc) -> Option { - if let Some(data_source_exec) = plan.as_any().downcast_ref::() { - if data_source_exec + if let Some(data_source_exec) = plan.downcast_ref::() + && data_source_exec .downcast_to_file_source::() .is_some() - { - return data_source_exec.metrics(); - } + { + return data_source_exec.metrics(); } for child in plan.children() { diff --git a/datafusion/core/tests/catalog/memory.rs b/datafusion/core/tests/catalog/memory.rs index 06ed141b2e8bd..b49183e92e387 100644 --- a/datafusion/core/tests/catalog/memory.rs +++ b/datafusion/core/tests/catalog/memory.rs @@ -26,7 +26,6 @@ use datafusion_catalog::memory::*; use datafusion_catalog::{SchemaProvider, TableProvider}; use datafusion_common::test_util::batches_to_string; use insta::assert_snapshot; -use std::any::Any; use std::sync::Arc; #[test] @@ -83,10 +82,6 @@ fn default_register_schema_not_supported() { #[derive(Debug)] struct TestProvider {} impl CatalogProvider for TestProvider { - fn as_any(&self) -> &dyn Any { - self - } - fn schema_names(&self) -> Vec { unimplemented!() } @@ -116,10 +111,12 @@ async fn test_mem_provider() { assert!(provider.deregister_table(table_name).unwrap().is_none()); let test_table = EmptyTable::new(Arc::new(Schema::empty())); // register table successfully - assert!(provider - .register_table(table_name.to_string(), Arc::new(test_table)) - .unwrap() - .is_none()); + assert!( + provider + .register_table(table_name.to_string(), Arc::new(test_table)) + .unwrap() + .is_none() + ); assert!(provider.table_exist(table_name)); let other_table = EmptyTable::new(Arc::new(Schema::empty())); let result = provider.register_table(table_name.to_string(), Arc::new(other_table)); diff --git a/datafusion/core/tests/catalog_listing/mod.rs b/datafusion/core/tests/catalog_listing/mod.rs new file mode 100644 index 0000000000000..cb6cac4fb0672 --- /dev/null +++ b/datafusion/core/tests/catalog_listing/mod.rs @@ -0,0 +1,18 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +mod pruned_partition_list; diff --git a/datafusion/core/tests/catalog_listing/pruned_partition_list.rs b/datafusion/core/tests/catalog_listing/pruned_partition_list.rs new file mode 100644 index 0000000000000..8f93dc17dbad2 --- /dev/null +++ b/datafusion/core/tests/catalog_listing/pruned_partition_list.rs @@ -0,0 +1,251 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use arrow_schema::DataType; +use futures::{FutureExt, StreamExt as _, TryStreamExt as _}; +use object_store::{ObjectStoreExt, memory::InMemory, path::Path}; + +use datafusion::execution::SessionStateBuilder; +use datafusion_catalog_listing::helpers::{ + describe_partition, list_partitions, pruned_partition_list, +}; +use datafusion_common::ScalarValue; +use datafusion_datasource::ListingTableUrl; +use datafusion_expr::{Expr, col, lit}; +use datafusion_session::Session; + +#[tokio::test] +async fn test_pruned_partition_list_empty() { + let (store, state) = make_test_store_and_state(&[ + ("tablepath/mypartition=val1/notparquetfile", 100), + ("tablepath/mypartition=val1/ignoresemptyfile.parquet", 0), + ("tablepath/file.parquet", 100), + ("tablepath/notapartition/file.parquet", 100), + ("tablepath/notmypartition=val1/file.parquet", 100), + ]); + let filter = Expr::eq(col("mypartition"), lit("val1")); + let pruned = pruned_partition_list( + state.as_ref(), + store.as_ref(), + &ListingTableUrl::parse("file:///tablepath/").unwrap(), + &[filter], + ".parquet", + &[(String::from("mypartition"), DataType::Utf8)], + ) + .await + .expect("partition pruning failed") + .collect::>() + .await; + + assert_eq!(pruned.len(), 0); +} + +#[tokio::test] +async fn test_pruned_partition_list() { + let (store, state) = make_test_store_and_state(&[ + ("tablepath/mypartition=val1/file.parquet", 100), + ("tablepath/mypartition=val2/file.parquet", 100), + ("tablepath/mypartition=val1/ignoresemptyfile.parquet", 0), + ("tablepath/mypartition=val1/other=val3/file.parquet", 100), + ("tablepath/notapartition/file.parquet", 100), + ("tablepath/notmypartition=val1/file.parquet", 100), + ]); + let filter = Expr::eq(col("mypartition"), lit("val1")); + let pruned = pruned_partition_list( + state.as_ref(), + store.as_ref(), + &ListingTableUrl::parse("file:///tablepath/").unwrap(), + &[filter], + ".parquet", + &[(String::from("mypartition"), DataType::Utf8)], + ) + 
.await + .expect("partition pruning failed") + .try_collect::>() + .await + .unwrap(); + + assert_eq!(pruned.len(), 2); + let f1 = &pruned[0]; + assert_eq!( + f1.object_meta.location.as_ref(), + "tablepath/mypartition=val1/file.parquet" + ); + assert_eq!(&f1.partition_values, &[ScalarValue::from("val1")]); + let f2 = &pruned[1]; + assert_eq!( + f2.object_meta.location.as_ref(), + "tablepath/mypartition=val1/other=val3/file.parquet" + ); + assert_eq!(f2.partition_values, &[ScalarValue::from("val1"),]); +} + +#[tokio::test] +async fn test_pruned_partition_list_multi() { + let (store, state) = make_test_store_and_state(&[ + ("tablepath/part1=p1v1/file.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v1/file1.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v1/file2.parquet", 100), + ("tablepath/part1=p1v3/part2=p2v1/file2.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v2/file2.parquet", 100), + ]); + let filter1 = Expr::eq(col("part1"), lit("p1v2")); + let filter2 = Expr::eq(col("part2"), lit("p2v1")); + let pruned = pruned_partition_list( + state.as_ref(), + store.as_ref(), + &ListingTableUrl::parse("file:///tablepath/").unwrap(), + &[filter1, filter2], + ".parquet", + &[ + (String::from("part1"), DataType::Utf8), + (String::from("part2"), DataType::Utf8), + ], + ) + .await + .expect("partition pruning failed") + .try_collect::>() + .await + .unwrap(); + + assert_eq!(pruned.len(), 2); + let f1 = &pruned[0]; + assert_eq!( + f1.object_meta.location.as_ref(), + "tablepath/part1=p1v2/part2=p2v1/file1.parquet" + ); + assert_eq!( + &f1.partition_values, + &[ScalarValue::from("p1v2"), ScalarValue::from("p2v1"),] + ); + let f2 = &pruned[1]; + assert_eq!( + f2.object_meta.location.as_ref(), + "tablepath/part1=p1v2/part2=p2v1/file2.parquet" + ); + assert_eq!( + &f2.partition_values, + &[ScalarValue::from("p1v2"), ScalarValue::from("p2v1")] + ); +} + +#[tokio::test] +async fn test_list_partition() { + let (store, _) = make_test_store_and_state(&[ + ("tablepath/part1=p1v1/file.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v1/file1.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v1/file2.parquet", 100), + ("tablepath/part1=p1v3/part2=p2v1/file3.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v2/file4.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v2/empty.parquet", 0), + ]); + + let partitions = list_partitions( + store.as_ref(), + &ListingTableUrl::parse("file:///tablepath/").unwrap(), + 0, + None, + ) + .await + .expect("listing partitions failed"); + + assert_eq!( + &partitions + .iter() + .map(describe_partition) + .collect::>(), + &vec![ + ("tablepath", 0, vec![]), + ("tablepath/part1=p1v1", 1, vec![]), + ("tablepath/part1=p1v2", 1, vec![]), + ("tablepath/part1=p1v3", 1, vec![]), + ] + ); + + let partitions = list_partitions( + store.as_ref(), + &ListingTableUrl::parse("file:///tablepath/").unwrap(), + 1, + None, + ) + .await + .expect("listing partitions failed"); + + assert_eq!( + &partitions + .iter() + .map(describe_partition) + .collect::>(), + &vec![ + ("tablepath", 0, vec![]), + ("tablepath/part1=p1v1", 1, vec!["file.parquet"]), + ("tablepath/part1=p1v2", 1, vec![]), + ("tablepath/part1=p1v2/part2=p2v1", 2, vec![]), + ("tablepath/part1=p1v2/part2=p2v2", 2, vec![]), + ("tablepath/part1=p1v3", 1, vec![]), + ("tablepath/part1=p1v3/part2=p2v1", 2, vec![]), + ] + ); + + let partitions = list_partitions( + store.as_ref(), + &ListingTableUrl::parse("file:///tablepath/").unwrap(), + 2, + None, + ) + .await + .expect("listing partitions failed"); + + assert_eq!( + &partitions + .iter() + 
.map(describe_partition) + .collect::>(), + &vec![ + ("tablepath", 0, vec![]), + ("tablepath/part1=p1v1", 1, vec!["file.parquet"]), + ("tablepath/part1=p1v2", 1, vec![]), + ("tablepath/part1=p1v3", 1, vec![]), + ( + "tablepath/part1=p1v2/part2=p2v1", + 2, + vec!["file1.parquet", "file2.parquet"] + ), + ("tablepath/part1=p1v2/part2=p2v2", 2, vec!["file4.parquet"]), + ("tablepath/part1=p1v3/part2=p2v1", 2, vec!["file3.parquet"]), + ] + ); +} + +pub fn make_test_store_and_state( + files: &[(&str, u64)], +) -> (Arc, Arc) { + let memory = InMemory::new(); + + for (name, size) in files { + memory + .put(&Path::from(*name), vec![0; *size as usize].into()) + .now_or_never() + .unwrap() + .unwrap(); + } + + let state = SessionStateBuilder::new().build(); + (Arc::new(memory), Arc::new(state)) +} diff --git a/datafusion/core/tests/config_from_env.rs b/datafusion/core/tests/config_from_env.rs index 976597c8a9ac5..6375d4e25d8eb 100644 --- a/datafusion/core/tests/config_from_env.rs +++ b/datafusion/core/tests/config_from_env.rs @@ -20,35 +20,43 @@ use std::env; #[test] fn from_env() { - // Note: these must be a single test to avoid interference from concurrent execution - let env_key = "DATAFUSION_OPTIMIZER_FILTER_NULL_JOIN_KEYS"; - // valid testing in different cases - for bool_option in ["true", "TRUE", "True", "tRUe"] { - env::set_var(env_key, bool_option); - let config = ConfigOptions::from_env().unwrap(); - env::remove_var(env_key); - assert!(config.optimizer.filter_null_join_keys); - } + unsafe { + // Note: these must be a single test to avoid interference from concurrent execution + let env_key = "DATAFUSION_OPTIMIZER_FILTER_NULL_JOIN_KEYS"; + // valid testing in different cases + for bool_option in ["true", "TRUE", "True", "tRUe"] { + env::set_var(env_key, bool_option); + let config = ConfigOptions::from_env().unwrap(); + env::remove_var(env_key); + assert!(config.optimizer.filter_null_join_keys); + } - // invalid testing - env::set_var(env_key, "ttruee"); - let err = ConfigOptions::from_env().unwrap_err().strip_backtrace(); - assert_eq!(err, "Error parsing 'ttruee' as bool\ncaused by\nExternal error: provided string was not `true` or `false`"); - env::remove_var(env_key); + // invalid testing + env::set_var(env_key, "ttruee"); + let err = ConfigOptions::from_env().unwrap_err().strip_backtrace(); + assert_eq!( + err, + "Error parsing 'ttruee' as bool\ncaused by\nExternal error: provided string was not `true` or `false`" + ); + env::remove_var(env_key); - let env_key = "DATAFUSION_EXECUTION_BATCH_SIZE"; + let env_key = "DATAFUSION_EXECUTION_BATCH_SIZE"; - // for valid testing - env::set_var(env_key, "4096"); - let config = ConfigOptions::from_env().unwrap(); - assert_eq!(config.execution.batch_size, 4096); + // for valid testing + env::set_var(env_key, "4096"); + let config = ConfigOptions::from_env().unwrap(); + assert_eq!(config.execution.batch_size, 4096); - // for invalid testing - env::set_var(env_key, "abc"); - let err = ConfigOptions::from_env().unwrap_err().strip_backtrace(); - assert_eq!(err, "Error parsing 'abc' as usize\ncaused by\nExternal error: invalid digit found in string"); + // for invalid testing + env::set_var(env_key, "abc"); + let err = ConfigOptions::from_env().unwrap_err().strip_backtrace(); + assert_eq!( + err, + "Error parsing 'abc' as usize\ncaused by\nExternal error: invalid digit found in string" + ); - env::remove_var(env_key); - let config = ConfigOptions::from_env().unwrap(); - assert_eq!(config.execution.batch_size, 8192); // set to its default value + 
env::remove_var(env_key); + let config = ConfigOptions::from_env().unwrap(); + assert_eq!(config.execution.batch_size, 8192); // set to its default value + } } diff --git a/datafusion/core/tests/core_integration.rs b/datafusion/core/tests/core_integration.rs index edcf039e4e704..99783427f022e 100644 --- a/datafusion/core/tests/core_integration.rs +++ b/datafusion/core/tests/core_integration.rs @@ -48,18 +48,21 @@ mod optimizer; /// Run all tests that are found in the `physical_optimizer` directory mod physical_optimizer; -/// Run all tests that are found in the `schema_adapter` directory -mod schema_adapter; - /// Run all tests that are found in the `serde` directory mod serde; /// Run all tests that are found in the `catalog` directory mod catalog; +/// Run all tests that are found in the `catalog_listing` directory +mod catalog_listing; + /// Run all tests that are found in the `tracing` directory mod tracing; +/// Run all tests that are found in the `extension_types` directory +mod extension_types; + #[cfg(test)] #[ctor::ctor] fn init() { diff --git a/datafusion/core/tests/custom_sources_cases/dml_planning.rs b/datafusion/core/tests/custom_sources_cases/dml_planning.rs new file mode 100644 index 0000000000000..24a3df7e0a8fa --- /dev/null +++ b/datafusion/core/tests/custom_sources_cases/dml_planning.rs @@ -0,0 +1,806 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tests for DELETE, UPDATE, and TRUNCATE planning to verify filter and assignment extraction. + +use std::sync::{Arc, Mutex}; + +use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use async_trait::async_trait; +use datafusion::datasource::{TableProvider, TableType}; +use datafusion::error::Result; +use datafusion::execution::context::{SessionConfig, SessionContext}; +use datafusion::logical_expr::{ + Expr, LogicalPlan, TableProviderFilterPushDown, TableScan, +}; +use datafusion_catalog::Session; +use datafusion_common::ScalarValue; +use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; +use datafusion_physical_plan::ExecutionPlan; +use datafusion_physical_plan::empty::EmptyExec; + +/// A TableProvider that captures the filters passed to delete_from(). 
+struct CaptureDeleteProvider { + schema: SchemaRef, + received_filters: Arc>>>, + filter_pushdown: TableProviderFilterPushDown, + per_filter_pushdown: Option>, +} + +impl CaptureDeleteProvider { + fn new(schema: SchemaRef) -> Self { + Self { + schema, + received_filters: Arc::new(Mutex::new(None)), + filter_pushdown: TableProviderFilterPushDown::Unsupported, + per_filter_pushdown: None, + } + } + + fn new_with_filter_pushdown( + schema: SchemaRef, + filter_pushdown: TableProviderFilterPushDown, + ) -> Self { + Self { + schema, + received_filters: Arc::new(Mutex::new(None)), + filter_pushdown, + per_filter_pushdown: None, + } + } + + fn new_with_per_filter_pushdown( + schema: SchemaRef, + per_filter_pushdown: Vec, + ) -> Self { + Self { + schema, + received_filters: Arc::new(Mutex::new(None)), + filter_pushdown: TableProviderFilterPushDown::Unsupported, + per_filter_pushdown: Some(per_filter_pushdown), + } + } + + fn captured_filters(&self) -> Option> { + self.received_filters.lock().unwrap().clone() + } +} + +impl std::fmt::Debug for CaptureDeleteProvider { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("CaptureDeleteProvider") + .field("schema", &self.schema) + .finish() + } +} + +#[async_trait] +impl TableProvider for CaptureDeleteProvider { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn table_type(&self) -> TableType { + TableType::Base + } + + async fn scan( + &self, + _state: &dyn Session, + _projection: Option<&Vec>, + _filters: &[Expr], + _limit: Option, + ) -> Result> { + Ok(Arc::new(EmptyExec::new(Arc::clone(&self.schema)))) + } + + async fn delete_from( + &self, + _state: &dyn Session, + filters: Vec, + ) -> Result> { + *self.received_filters.lock().unwrap() = Some(filters); + Ok(Arc::new(EmptyExec::new(Arc::new(Schema::new(vec![ + Field::new("count", DataType::UInt64, false), + ]))))) + } + + fn supports_filters_pushdown( + &self, + filters: &[&Expr], + ) -> Result> { + if let Some(per_filter) = &self.per_filter_pushdown + && per_filter.len() == filters.len() + { + return Ok(per_filter.clone()); + } + + Ok(vec![self.filter_pushdown.clone(); filters.len()]) + } +} + +/// A TableProvider that captures filters and assignments passed to update(). 
+#[expect(clippy::type_complexity)] +struct CaptureUpdateProvider { + schema: SchemaRef, + received_filters: Arc>>>, + received_assignments: Arc>>>, + filter_pushdown: TableProviderFilterPushDown, + per_filter_pushdown: Option>, +} + +impl CaptureUpdateProvider { + fn new(schema: SchemaRef) -> Self { + Self { + schema, + received_filters: Arc::new(Mutex::new(None)), + received_assignments: Arc::new(Mutex::new(None)), + filter_pushdown: TableProviderFilterPushDown::Unsupported, + per_filter_pushdown: None, + } + } + + fn new_with_filter_pushdown( + schema: SchemaRef, + filter_pushdown: TableProviderFilterPushDown, + ) -> Self { + Self { + schema, + received_filters: Arc::new(Mutex::new(None)), + received_assignments: Arc::new(Mutex::new(None)), + filter_pushdown, + per_filter_pushdown: None, + } + } + + fn captured_filters(&self) -> Option> { + self.received_filters.lock().unwrap().clone() + } + + fn captured_assignments(&self) -> Option> { + self.received_assignments.lock().unwrap().clone() + } +} + +impl std::fmt::Debug for CaptureUpdateProvider { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("CaptureUpdateProvider") + .field("schema", &self.schema) + .finish() + } +} + +#[async_trait] +impl TableProvider for CaptureUpdateProvider { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn table_type(&self) -> TableType { + TableType::Base + } + + async fn scan( + &self, + _state: &dyn Session, + _projection: Option<&Vec>, + _filters: &[Expr], + _limit: Option, + ) -> Result> { + Ok(Arc::new(EmptyExec::new(Arc::clone(&self.schema)))) + } + + async fn update( + &self, + _state: &dyn Session, + assignments: Vec<(String, Expr)>, + filters: Vec, + ) -> Result> { + *self.received_filters.lock().unwrap() = Some(filters); + *self.received_assignments.lock().unwrap() = Some(assignments); + Ok(Arc::new(EmptyExec::new(Arc::new(Schema::new(vec![ + Field::new("count", DataType::UInt64, false), + ]))))) + } + + fn supports_filters_pushdown( + &self, + filters: &[&Expr], + ) -> Result> { + if let Some(per_filter) = &self.per_filter_pushdown + && per_filter.len() == filters.len() + { + return Ok(per_filter.clone()); + } + + Ok(vec![self.filter_pushdown.clone(); filters.len()]) + } +} + +/// A TableProvider that captures whether truncate() was called. 
+struct CaptureTruncateProvider { + schema: SchemaRef, + truncate_called: Arc>, +} + +impl CaptureTruncateProvider { + fn new(schema: SchemaRef) -> Self { + Self { + schema, + truncate_called: Arc::new(Mutex::new(false)), + } + } + + fn was_truncated(&self) -> bool { + *self.truncate_called.lock().unwrap() + } +} + +impl std::fmt::Debug for CaptureTruncateProvider { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("CaptureTruncateProvider") + .field("schema", &self.schema) + .finish() + } +} + +#[async_trait] +impl TableProvider for CaptureTruncateProvider { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn table_type(&self) -> TableType { + TableType::Base + } + + async fn scan( + &self, + _state: &dyn Session, + _projection: Option<&Vec>, + _filters: &[Expr], + _limit: Option, + ) -> Result> { + Ok(Arc::new(EmptyExec::new(Arc::clone(&self.schema)))) + } + + async fn truncate(&self, _state: &dyn Session) -> Result> { + *self.truncate_called.lock().unwrap() = true; + + Ok(Arc::new(EmptyExec::new(Arc::new(Schema::new(vec![ + Field::new("count", DataType::UInt64, false), + ]))))) + } +} + +fn test_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("status", DataType::Utf8, true), + Field::new("value", DataType::Int32, true), + ])) +} + +#[tokio::test] +async fn test_delete_single_filter() -> Result<()> { + let provider = Arc::new(CaptureDeleteProvider::new(test_schema())); + let ctx = SessionContext::new(); + ctx.register_table("t", Arc::clone(&provider) as Arc)?; + + ctx.sql("DELETE FROM t WHERE id = 1") + .await? + .collect() + .await?; + + let filters = provider + .captured_filters() + .expect("filters should be captured"); + assert_eq!(filters.len(), 1); + assert!(filters[0].to_string().contains("id")); + Ok(()) +} + +#[tokio::test] +async fn test_delete_multiple_filters() -> Result<()> { + let provider = Arc::new(CaptureDeleteProvider::new(test_schema())); + let ctx = SessionContext::new(); + ctx.register_table("t", Arc::clone(&provider) as Arc)?; + + ctx.sql("DELETE FROM t WHERE id = 1 AND status = 'x'") + .await? + .collect() + .await?; + + let filters = provider + .captured_filters() + .expect("filters should be captured"); + assert!(!filters.is_empty()); + Ok(()) +} + +#[tokio::test] +async fn test_delete_no_filters() -> Result<()> { + let provider = Arc::new(CaptureDeleteProvider::new(test_schema())); + let ctx = SessionContext::new(); + ctx.register_table("t", Arc::clone(&provider) as Arc)?; + + ctx.sql("DELETE FROM t").await?.collect().await?; + + let filters = provider + .captured_filters() + .expect("filters should be captured"); + assert!( + filters.is_empty(), + "DELETE without WHERE should have empty filters" + ); + Ok(()) +} + +#[tokio::test] +async fn test_delete_complex_expr() -> Result<()> { + let provider = Arc::new(CaptureDeleteProvider::new(test_schema())); + let ctx = SessionContext::new(); + ctx.register_table("t", Arc::clone(&provider) as Arc)?; + + ctx.sql("DELETE FROM t WHERE id > 5 AND (status = 'a' OR status = 'b')") + .await? 
+ .collect() + .await?; + + let filters = provider + .captured_filters() + .expect("filters should be captured"); + assert!(!filters.is_empty()); + Ok(()) +} + +#[tokio::test] +async fn test_delete_filter_pushdown_extracts_table_scan_filters() -> Result<()> { + let provider = Arc::new(CaptureDeleteProvider::new_with_filter_pushdown( + test_schema(), + TableProviderFilterPushDown::Exact, + )); + let ctx = SessionContext::new(); + ctx.register_table("t", Arc::clone(&provider) as Arc)?; + + let df = ctx.sql("DELETE FROM t WHERE id = 1").await?; + let optimized_plan = df.clone().into_optimized_plan()?; + + let mut scan_filters = Vec::new(); + optimized_plan.apply(|node| { + if let LogicalPlan::TableScan(TableScan { filters, .. }) = node { + scan_filters.extend(filters.clone()); + } + Ok(TreeNodeRecursion::Continue) + })?; + + assert_eq!(scan_filters.len(), 1); + assert!(scan_filters[0].to_string().contains("id")); + + df.collect().await?; + + let filters = provider + .captured_filters() + .expect("filters should be captured"); + assert_eq!(filters.len(), 1); + assert!(filters[0].to_string().contains("id")); + Ok(()) +} + +#[tokio::test] +async fn test_delete_compound_filters_with_pushdown() -> Result<()> { + let provider = Arc::new(CaptureDeleteProvider::new_with_filter_pushdown( + test_schema(), + TableProviderFilterPushDown::Exact, + )); + let ctx = SessionContext::new(); + ctx.register_table("t", Arc::clone(&provider) as Arc)?; + + ctx.sql("DELETE FROM t WHERE id = 1 AND status = 'active'") + .await? + .collect() + .await?; + + let filters = provider + .captured_filters() + .expect("filters should be captured"); + // Should receive both filters, not deduplicate valid separate predicates + assert_eq!( + filters.len(), + 2, + "compound filters should not be over-suppressed" + ); + + let filter_strs: Vec = filters.iter().map(|f| f.to_string()).collect(); + assert!( + filter_strs.iter().any(|s| s.contains("id")), + "should contain id filter" + ); + assert!( + filter_strs.iter().any(|s| s.contains("status")), + "should contain status filter" + ); + Ok(()) +} + +#[tokio::test] +async fn test_delete_mixed_filter_locations() -> Result<()> { + // Test mixed-location filters: some in Filter node, some in TableScan.filters + // This happens when provider uses TableProviderFilterPushDown::Inexact, + // meaning it can push down some predicates but not others. + let provider = Arc::new(CaptureDeleteProvider::new_with_filter_pushdown( + test_schema(), + TableProviderFilterPushDown::Inexact, + )); + let ctx = SessionContext::new(); + ctx.register_table("t", Arc::clone(&provider) as Arc)?; + + // Execute DELETE with compound WHERE clause + ctx.sql("DELETE FROM t WHERE id = 1 AND status = 'active'") + .await? 
+ .collect() + .await?; + + // Verify that both predicates are extracted and passed to delete_from(), + // even though they may be split between Filter node and TableScan.filters + let filters = provider + .captured_filters() + .expect("filters should be captured"); + assert_eq!( + filters.len(), + 2, + "should extract both predicates (union of Filter and TableScan.filters)" + ); + + let filter_strs: Vec = filters.iter().map(|f| f.to_string()).collect(); + assert!( + filter_strs.iter().any(|s| s.contains("id")), + "should contain id filter" + ); + assert!( + filter_strs.iter().any(|s| s.contains("status")), + "should contain status filter" + ); + Ok(()) +} + +#[tokio::test] +async fn test_delete_per_filter_pushdown_mixed_locations() -> Result<()> { + // Force per-filter pushdown decisions to exercise mixed locations in one query. + // First predicate is pushed down (Exact), second stays as residual (Unsupported). + let provider = Arc::new(CaptureDeleteProvider::new_with_per_filter_pushdown( + test_schema(), + vec![ + TableProviderFilterPushDown::Exact, + TableProviderFilterPushDown::Unsupported, + ], + )); + + let ctx = SessionContext::new(); + ctx.register_table("t", Arc::clone(&provider) as Arc)?; + + let df = ctx + .sql("DELETE FROM t WHERE id = 1 AND status = 'active'") + .await?; + let optimized_plan = df.clone().into_optimized_plan()?; + + // Only the first predicate should be pushed to TableScan.filters. + let mut scan_filters = Vec::new(); + optimized_plan.apply(|node| { + if let LogicalPlan::TableScan(TableScan { filters, .. }) = node { + scan_filters.extend(filters.clone()); + } + Ok(TreeNodeRecursion::Continue) + })?; + assert_eq!(scan_filters.len(), 1); + assert!(scan_filters[0].to_string().contains("id")); + + // Both predicates should still reach the provider (union + dedup behavior). + df.collect().await?; + + let filters = provider + .captured_filters() + .expect("filters should be captured"); + assert_eq!(filters.len(), 2); + + let filter_strs: Vec = filters.iter().map(|f| f.to_string()).collect(); + assert!( + filter_strs.iter().any(|s| s.contains("id")), + "should contain pushed-down id filter" + ); + assert!( + filter_strs.iter().any(|s| s.contains("status")), + "should contain residual status filter" + ); + + Ok(()) +} + +#[tokio::test] +async fn test_update_assignments() -> Result<()> { + let provider = Arc::new(CaptureUpdateProvider::new(test_schema())); + let ctx = SessionContext::new(); + ctx.register_table("t", Arc::clone(&provider) as Arc)?; + + ctx.sql("UPDATE t SET value = 100, status = 'updated' WHERE id = 5") + .await? 
+ .collect() + .await?; + + let assignments = provider + .captured_assignments() + .expect("assignments should be captured"); + assert_eq!(assignments.len(), 2, "should have 2 assignments"); + + let filters = provider + .captured_filters() + .expect("filters should be captured"); + assert!(!filters.is_empty(), "should have filter for WHERE clause"); + Ok(()) +} + +#[tokio::test] +async fn test_update_filter_pushdown_extracts_table_scan_filters() -> Result<()> { + let provider = Arc::new(CaptureUpdateProvider::new_with_filter_pushdown( + test_schema(), + TableProviderFilterPushDown::Exact, + )); + let ctx = SessionContext::new(); + ctx.register_table("t", Arc::clone(&provider) as Arc)?; + + let df = ctx.sql("UPDATE t SET value = 100 WHERE id = 1").await?; + let optimized_plan = df.clone().into_optimized_plan()?; + + // Verify that the optimizer pushed down the filter into TableScan + let mut scan_filters = Vec::new(); + optimized_plan.apply(|node| { + if let LogicalPlan::TableScan(TableScan { filters, .. }) = node { + scan_filters.extend(filters.clone()); + } + Ok(TreeNodeRecursion::Continue) + })?; + + assert_eq!(scan_filters.len(), 1); + assert!(scan_filters[0].to_string().contains("id")); + + // Execute the UPDATE and verify filters were extracted and passed to update() + df.collect().await?; + + let filters = provider + .captured_filters() + .expect("filters should be captured"); + assert_eq!(filters.len(), 1); + assert!(filters[0].to_string().contains("id")); + Ok(()) +} + +#[tokio::test] +async fn test_update_filter_pushdown_passes_table_scan_filters() -> Result<()> { + let provider = Arc::new(CaptureUpdateProvider::new_with_filter_pushdown( + test_schema(), + TableProviderFilterPushDown::Exact, + )); + let ctx = SessionContext::new(); + ctx.register_table("t", Arc::clone(&provider) as Arc)?; + + let df = ctx + .sql("UPDATE t SET value = 42 WHERE status = 'ready'") + .await?; + let optimized_plan = df.clone().into_optimized_plan()?; + + let mut scan_filters = Vec::new(); + optimized_plan.apply(|node| { + if let LogicalPlan::TableScan(TableScan { filters, .. 
}) = node { + scan_filters.extend(filters.clone()); + } + Ok(TreeNodeRecursion::Continue) + })?; + + assert!( + !scan_filters.is_empty(), + "expected filter pushdown to populate TableScan filters" + ); + + df.collect().await?; + + let filters = provider + .captured_filters() + .expect("filters should be captured"); + assert!( + !filters.is_empty(), + "expected filters extracted from TableScan during UPDATE" + ); + Ok(()) +} + +#[tokio::test] +async fn test_truncate_calls_provider() -> Result<()> { + let provider = Arc::new(CaptureTruncateProvider::new(test_schema())); + let config = SessionConfig::new().set( + "datafusion.optimizer.max_passes", + &ScalarValue::UInt64(Some(0)), + ); + + let ctx = SessionContext::new_with_config(config); + + ctx.register_table("t", Arc::clone(&provider) as Arc)?; + + ctx.sql("TRUNCATE TABLE t").await?.collect().await?; + + assert!( + provider.was_truncated(), + "truncate() should be called on the TableProvider" + ); + + Ok(()) +} + +#[tokio::test] +async fn test_unsupported_table_delete() -> Result<()> { + let schema = test_schema(); + let ctx = SessionContext::new(); + + let empty_table = datafusion::datasource::empty::EmptyTable::new(schema); + ctx.register_table("empty_t", Arc::new(empty_table))?; + + let result = ctx.sql("DELETE FROM empty_t WHERE id = 1").await; + assert!(result.is_err() || result.unwrap().collect().await.is_err()); + Ok(()) +} + +#[tokio::test] +async fn test_unsupported_table_update() -> Result<()> { + let schema = test_schema(); + let ctx = SessionContext::new(); + + let empty_table = datafusion::datasource::empty::EmptyTable::new(schema); + ctx.register_table("empty_t", Arc::new(empty_table))?; + + let result = ctx.sql("UPDATE empty_t SET value = 1 WHERE id = 1").await; + + assert!(result.is_err() || result.unwrap().collect().await.is_err()); + Ok(()) +} + +#[tokio::test] +async fn test_delete_target_table_scoping() -> Result<()> { + // Test that DELETE only extracts filters from the target table, + // not from other tables (important for DELETE...FROM safety) + let target_provider = Arc::new(CaptureDeleteProvider::new_with_filter_pushdown( + test_schema(), + TableProviderFilterPushDown::Exact, + )); + let ctx = SessionContext::new(); + ctx.register_table( + "target_t", + Arc::clone(&target_provider) as Arc, + )?; + + // For now, we test single-table DELETE + // and validate that the scoping logic is correct + let df = ctx.sql("DELETE FROM target_t WHERE id > 5").await?; + df.collect().await?; + + let filters = target_provider + .captured_filters() + .expect("filters should be captured"); + assert_eq!(filters.len(), 1); + assert!( + filters[0].to_string().contains("id"), + "Filter should be for id column" + ); + assert!( + filters[0].to_string().contains("5"), + "Filter should contain the value 5" + ); + Ok(()) +} + +#[tokio::test] +async fn test_update_from_drops_non_target_predicates() -> Result<()> { + // UPDATE ... 
+
+#[tokio::test]
+async fn test_unsupported_table_delete() -> Result<()> {
+    let schema = test_schema();
+    let ctx = SessionContext::new();
+
+    let empty_table = datafusion::datasource::empty::EmptyTable::new(schema);
+    ctx.register_table("empty_t", Arc::new(empty_table))?;
+
+    let result = ctx.sql("DELETE FROM empty_t WHERE id = 1").await;
+    assert!(result.is_err() || result.unwrap().collect().await.is_err());
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_unsupported_table_update() -> Result<()> {
+    let schema = test_schema();
+    let ctx = SessionContext::new();
+
+    let empty_table = datafusion::datasource::empty::EmptyTable::new(schema);
+    ctx.register_table("empty_t", Arc::new(empty_table))?;
+
+    let result = ctx.sql("UPDATE empty_t SET value = 1 WHERE id = 1").await;
+
+    assert!(result.is_err() || result.unwrap().collect().await.is_err());
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_delete_target_table_scoping() -> Result<()> {
+    // Test that DELETE only extracts filters from the target table,
+    // not from other tables (important for DELETE ... FROM safety)
+    let target_provider = Arc::new(CaptureDeleteProvider::new_with_filter_pushdown(
+        test_schema(),
+        TableProviderFilterPushDown::Exact,
+    ));
+    let ctx = SessionContext::new();
+    ctx.register_table(
+        "target_t",
+        Arc::clone(&target_provider) as Arc<dyn TableProvider>,
+    )?;
+
+    // For now, we test single-table DELETE
+    // and validate that the scoping logic is correct
+    let df = ctx.sql("DELETE FROM target_t WHERE id > 5").await?;
+    df.collect().await?;
+
+    let filters = target_provider
+        .captured_filters()
+        .expect("filters should be captured");
+    assert_eq!(filters.len(), 1);
+    assert!(
+        filters[0].to_string().contains("id"),
+        "Filter should be for id column"
+    );
+    assert!(
+        filters[0].to_string().contains("5"),
+        "Filter should contain the value 5"
+    );
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_update_from_drops_non_target_predicates() -> Result<()> {
+    // UPDATE ... FROM is not currently supported
+    // TODO fix https://github.com/apache/datafusion/issues/19950
+    let target_provider = Arc::new(CaptureUpdateProvider::new_with_filter_pushdown(
+        test_schema(),
+        TableProviderFilterPushDown::Exact,
+    ));
+    let ctx = SessionContext::new();
+    ctx.register_table("t1", Arc::clone(&target_provider) as Arc<dyn TableProvider>)?;
+
+    let source_schema = Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Int32, false),
+        Field::new("status", DataType::Utf8, true),
+        // t2-only column to avoid false negatives after qualifier stripping
+        Field::new("src_only", DataType::Utf8, true),
+    ]));
+    let source_table = datafusion::datasource::empty::EmptyTable::new(source_schema);
+    ctx.register_table("t2", Arc::new(source_table))?;
+
+    let result = ctx
+        .sql(
+            "UPDATE t1 SET value = 1 FROM t2 \
+             WHERE t1.id = t2.id AND t2.src_only = 'active' AND t1.value > 10",
+        )
+        .await;
+
+    // Verify UPDATE ... FROM is rejected with an appropriate error
+    // TODO fix https://github.com/apache/datafusion/issues/19950
+    assert!(result.is_err());
+    let err = result.unwrap_err();
+    assert!(
+        err.to_string().contains("UPDATE ... FROM is not supported"),
+        "Expected 'UPDATE ... FROM is not supported' error, got: {err}"
+    );
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_delete_qualifier_stripping_and_validation() -> Result<()> {
+    // Test that filter qualifiers are properly stripped and validated
+    // Unqualified predicates should work fine
+    let provider = Arc::new(CaptureDeleteProvider::new_with_filter_pushdown(
+        test_schema(),
+        TableProviderFilterPushDown::Exact,
+    ));
+    let ctx = SessionContext::new();
+    ctx.register_table("t", Arc::clone(&provider) as Arc<dyn TableProvider>)?;
+
+    // Execute DELETE with unqualified column reference
+    // (After parsing, the planner adds qualifiers, but our validation should accept them)
+    let df = ctx.sql("DELETE FROM t WHERE id = 1").await?;
+    df.collect().await?;
+
+    let filters = provider
+        .captured_filters()
+        .expect("filters should be captured");
+    assert!(!filters.is_empty(), "Should have extracted filter");
+
+    // Verify qualifiers are stripped: check that Column expressions have no qualifier
+    let has_qualified_column = filters[0]
+        .exists(|expr| Ok(matches!(expr, Expr::Column(col) if col.relation.is_some())))?;
+    assert!(
+        !has_qualified_column,
+        "Filter should have unqualified columns after stripping"
+    );
+
+    // Also verify the string representation doesn't contain table qualifiers
+    let filter_str = filters[0].to_string();
+    assert!(
+        !filter_str.contains("t.id"),
+        "Filter should not contain qualified column reference, got: {filter_str}"
+    );
+    assert!(
+        filter_str.contains("id") || filter_str.contains("1"),
+        "Filter should reference id column or the value 1, got: {filter_str}"
+    );
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_unsupported_table_truncate() -> Result<()> {
+    let schema = test_schema();
+    let ctx = SessionContext::new();
+
+    let empty_table = datafusion::datasource::empty::EmptyTable::new(schema);
+    ctx.register_table("empty_t", Arc::new(empty_table))?;
+
+    let result = ctx.sql("TRUNCATE TABLE empty_t").await;
+
+    assert!(result.is_err() || result.unwrap().collect().await.is_err());
+
+    Ok(())
+}
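+
+// Illustrative sketch (assumption, not code from this PR): the qualifier
+// stripping exercised by `test_delete_qualifier_stripping_and_validation`
+// can be written with the tree-node API roughly like this, rewriting every
+// qualified column into an unqualified one before the filters reach the
+// provider:
+//
+// use datafusion_common::tree_node::{Transformed, TreeNode};
+//
+// fn strip_qualifiers(expr: Expr) -> Result<Expr> {
+//     Ok(expr
+//         .transform(|e| {
+//             Ok(match e {
+//                 Expr::Column(col) => Transformed::yes(Expr::Column(
+//                     Column::new_unqualified(col.name),
+//                 )),
+//                 other => Transformed::no(other),
+//             })
+//         })?
+//         .data)
+// }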
diff --git a/datafusion/core/tests/custom_sources_cases/mod.rs b/datafusion/core/tests/custom_sources_cases/mod.rs
index cbdc4a448ea41..cef75b444f6fe 100644
--- a/datafusion/core/tests/custom_sources_cases/mod.rs
+++ b/datafusion/core/tests/custom_sources_cases/mod.rs
@@ -15,7 +15,6 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::any::Any;
 use std::pin::Pin;
 use std::sync::Arc;
 use std::task::{Context, Poll};
@@ -28,25 +27,27 @@ use datafusion::datasource::{TableProvider, TableType};
 use datafusion::error::Result;
 use datafusion::execution::context::{SessionContext, TaskContext};
 use datafusion::logical_expr::{
-    col, Expr, LogicalPlan, LogicalPlanBuilder, TableScan, UNNAMED_TABLE,
+    Expr, LogicalPlan, LogicalPlanBuilder, TableScan, UNNAMED_TABLE, col,
 };
 use datafusion::physical_plan::{
-    collect, ColumnStatistics, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning,
-    RecordBatchStream, SendableRecordBatchStream, Statistics,
+    ColumnStatistics, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning,
+    RecordBatchStream, SendableRecordBatchStream, Statistics, collect,
 };
 use datafusion::scalar::ScalarValue;
 use datafusion_catalog::Session;
 use datafusion_common::cast::as_primitive_array;
 use datafusion_common::project_schema;
 use datafusion_common::stats::Precision;
+use datafusion_common::tree_node::TreeNodeRecursion;
 use datafusion_physical_expr::EquivalenceProperties;
+use datafusion_physical_plan::PlanProperties;
 use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType};
 use datafusion_physical_plan::placeholder_row::PlaceholderRowExec;
-use datafusion_physical_plan::PlanProperties;
 
 use async_trait::async_trait;
 use futures::stream::Stream;
 
+mod dml_planning;
 mod provider_filter_pushdown;
 mod statistics;
 
@@ -78,7 +79,7 @@ struct CustomTableProvider;
 #[derive(Debug, Clone)]
 struct CustomExecutionPlan {
     projection: Option<Vec<usize>>,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl CustomExecutionPlan {
@@ -87,7 +88,10 @@ impl CustomExecutionPlan {
         let schema =
             project_schema(&schema, projection.as_ref()).expect("projected schema");
         let cache = Self::compute_properties(schema);
-        Self { projection, cache }
+        Self {
+            projection,
+            cache: Arc::new(cache),
+        }
     }
 
     /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
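+    // Sketch (assumption, for illustration only): with the cache held as
+    // `Arc<PlanProperties>`, callers can share the computed properties
+    // cheaply instead of recomputing them, e.g.
+    //     let props: Arc<PlanProperties> = Arc::clone(plan.properties());
+    // which is what the `properties(&self) -> &Arc<PlanProperties>` change
+    // below enables.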
@@ -152,11 +156,7 @@ impl ExecutionPlan for CustomExecutionPlan {
         Self::static_name()
     }
 
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -179,16 +179,12 @@ impl ExecutionPlan for CustomExecutionPlan {
         Ok(Box::pin(TestCustomRecordBatchStream { nb_batch: 1 }))
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.partition_statistics(None)
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
         if partition.is_some() {
-            return Ok(Statistics::new_unknown(&self.schema()));
+            return Ok(Arc::new(Statistics::new_unknown(&self.schema())));
         }
         let batch = TEST_CUSTOM_RECORD_BATCH!().unwrap();
-        Ok(Statistics {
+        Ok(Arc::new(Statistics {
             num_rows: Precision::Exact(batch.num_rows()),
             total_byte_size: Precision::Absent,
             column_statistics: self
@@ -207,16 +203,28 @@ impl ExecutionPlan for CustomExecutionPlan {
                     ..Default::default()
                 })
                 .collect(),
-        })
+        }))
+    }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(
+            &dyn datafusion::physical_plan::PhysicalExpr,
+        ) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in the output ordering from equivalence properties
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = self.cache.output_ordering() {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
     }
 }
 
 #[async_trait]
 impl TableProvider for CustomTableProvider {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
     fn schema(&self) -> SchemaRef {
         TEST_CUSTOM_SCHEMA_REF!()
     }
@@ -316,8 +324,9 @@ async fn optimizers_catch_all_statistics() {
     assert_eq!(format!("{:?}", actual[0]), format!("{expected:?}"));
 }
 
+#[expect(clippy::needless_pass_by_value)]
 fn contains_place_holder_exec(plan: Arc<dyn ExecutionPlan>) -> bool {
-    if plan.as_any().is::<PlaceholderRowExec>() {
+    if plan.is::<PlaceholderRowExec>() {
         true
     } else if plan.children().len() != 1 {
         false
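+
+// Review sketch (hypothetical helper, not part of this change): the
+// `apply_expressions` body added above is repeated verbatim in
+// provider_filter_pushdown.rs and statistics.rs below; it could be shared as:
+//
+// fn apply_ordering_expressions(
+//     props: &PlanProperties,
+//     f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+// ) -> Result<TreeNodeRecursion> {
+//     let mut tnr = TreeNodeRecursion::Continue;
+//     if let Some(ordering) = props.output_ordering() {
+//         for sort_expr in ordering {
+//             tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+//         }
+//     }
+//     Ok(tnr)
+// }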
diff --git a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs
index c80c0b4bf54ba..e52c559ec79ef 100644
--- a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs
+++ b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs
@@ -29,13 +29,14 @@ use datafusion::logical_expr::TableProviderFilterPushDown;
 use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
 use datafusion::physical_plan::{
     DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties,
-    SendableRecordBatchStream, Statistics,
+    SendableRecordBatchStream,
 };
 use datafusion::prelude::*;
 use datafusion::scalar::ScalarValue;
 use datafusion_catalog::Session;
 use datafusion_common::cast::as_primitive_array;
-use datafusion_common::{internal_err, not_impl_err};
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{DataFusionError, internal_err, not_impl_err};
 use datafusion_expr::expr::{BinaryExpr, Cast};
 use datafusion_functions_aggregate::expr_fn::count;
 use datafusion_physical_expr::EquivalenceProperties;
@@ -62,13 +63,16 @@ fn create_batch(value: i32, num_rows: usize) -> Result<RecordBatch> {
 
 #[derive(Debug)]
 struct CustomPlan {
     batches: Vec<RecordBatch>,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl CustomPlan {
     fn new(schema: SchemaRef, batches: Vec<RecordBatch>) -> Self {
         let cache = Self::compute_properties(schema);
-        Self { batches, cache }
+        Self {
+            batches,
+            cache: Arc::new(cache),
+        }
     }
 
     /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
@@ -105,11 +109,7 @@ impl ExecutionPlan for CustomPlan {
         Self::static_name()
     }
 
-    fn as_any(&self) -> &dyn std::any::Any {
-        self
-    }
-
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -134,16 +134,36 @@ impl ExecutionPlan for CustomPlan {
         _partition: usize,
         _context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
+        let schema_captured = self.schema().clone();
         Ok(Box::pin(RecordBatchStreamAdapter::new(
             self.schema(),
-            futures::stream::iter(self.batches.clone().into_iter().map(Ok)),
+            futures::stream::iter(self.batches.clone().into_iter().map(move |batch| {
+                // Re-project each stored batch onto the (possibly projected)
+                // output schema by matching field names
+                let projection: Vec<usize> = schema_captured
+                    .fields()
+                    .iter()
+                    .filter_map(|field| batch.schema().index_of(field.name()).ok())
+                    .collect();
+                batch
+                    .project(&projection)
+                    .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))
+            })),
         )))
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        // here we could provide more accurate statistics
-        // but we want to test the filter pushdown not the CBOs
-        Ok(Statistics::new_unknown(&self.schema()))
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(
+            &dyn datafusion::physical_plan::PhysicalExpr,
+        ) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in the output ordering from equivalence properties
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = self.cache.output_ordering() {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
     }
 }
 
@@ -155,10 +175,6 @@ struct CustomProvider {
 
 #[async_trait]
 impl TableProvider for CustomProvider {
-    fn as_any(&self) -> &dyn std::any::Any {
-        self
-    }
-
     fn schema(&self) -> SchemaRef {
         self.zero_batch.schema()
     }
@@ -183,7 +199,7 @@ impl TableProvider for CustomProvider {
                     Expr::Literal(ScalarValue::Int16(Some(i)), _) => *i as i64,
                     Expr::Literal(ScalarValue::Int32(Some(i)), _) => *i as i64,
                     Expr::Literal(ScalarValue::Int64(Some(i)), _) => *i,
-                    Expr::Cast(Cast { expr, data_type: _ }) => match expr.deref() {
+                    Expr::Cast(Cast { expr, field: _ }) => match expr.deref() {
                         Expr::Literal(lit_value, _) => match lit_value {
                             ScalarValue::Int8(Some(v)) => *v as i64,
                             ScalarValue::Int16(Some(v)) => *v as i64,
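+
+// Sketch (illustrative only; `project_to_schema` is a hypothetical name): the
+// name-based re-projection added to `execute` above, written as a standalone
+// helper. It maps each field of the (possibly projected) output schema back
+// to its index in the stored batch and projects accordingly:
+//
+// fn project_to_schema(batch: &RecordBatch, schema: &SchemaRef) -> Result<RecordBatch> {
+//     let indices: Vec<usize> = schema
+//         .fields()
+//         .iter()
+//         .filter_map(|f| batch.schema().index_of(f.name()).ok())
+//         .collect();
+//     batch
+//         .project(&indices)
+//         .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))
+// }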
diff --git a/datafusion/core/tests/custom_sources_cases/statistics.rs b/datafusion/core/tests/custom_sources_cases/statistics.rs
index 403c04f1737e1..01c4deac5ccd3 100644
--- a/datafusion/core/tests/custom_sources_cases/statistics.rs
+++ b/datafusion/core/tests/custom_sources_cases/statistics.rs
@@ -17,7 +17,7 @@
 //! This module contains end to end tests of statistics propagation
 
-use std::{any::Any, sync::Arc};
+use std::sync::Arc;
 
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use datafusion::execution::context::TaskContext;
@@ -33,6 +33,7 @@ use datafusion::{
     scalar::ScalarValue,
 };
 use datafusion_catalog::Session;
+use datafusion_common::tree_node::TreeNodeRecursion;
 use datafusion_common::{project_schema, stats::Precision};
 use datafusion_physical_expr::EquivalenceProperties;
 use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType};
@@ -45,7 +46,7 @@ use async_trait::async_trait;
 struct StatisticsValidation {
     stats: Statistics,
     schema: Arc<Schema>,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl StatisticsValidation {
@@ -59,7 +60,7 @@ impl StatisticsValidation {
         Self {
             stats,
             schema,
-            cache,
+            cache: Arc::new(cache),
         }
     }
 
@@ -76,10 +77,6 @@ impl StatisticsValidation {
 
 #[async_trait]
 impl TableProvider for StatisticsValidation {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
     fn schema(&self) -> SchemaRef {
         Arc::clone(&self.schema)
     }
@@ -154,11 +151,7 @@ impl ExecutionPlan for StatisticsValidation {
         Self::static_name()
     }
 
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -181,17 +174,29 @@ impl ExecutionPlan for StatisticsValidation {
         unimplemented!("This plan only serves for testing statistics")
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        Ok(self.stats.clone())
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
         if partition.is_some() {
-            Ok(Statistics::new_unknown(&self.schema))
+            Ok(Arc::new(Statistics::new_unknown(&self.schema)))
         } else {
-            Ok(self.stats.clone())
+            Ok(Arc::new(self.stats.clone()))
         }
     }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(
+            &dyn datafusion::physical_plan::PhysicalExpr,
+        ) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in the output ordering from equivalence properties
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = self.cache.output_ordering() {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
+    }
 }
 
 fn init_ctx(stats: Statistics, schema: Schema) -> Result<SessionContext> {
@@ -214,6 +219,7 @@ fn fully_defined() -> (Statistics, Schema) {
                 min_value: Precision::Exact(ScalarValue::Int32(Some(-24))),
                 sum_value: Precision::Exact(ScalarValue::Int64(Some(10))),
                 null_count: Precision::Exact(0),
+                byte_size: Precision::Absent,
             },
             ColumnStatistics {
                 distinct_count: Precision::Exact(13),
@@ -221,6 +227,7 @@ fn fully_defined() -> (Statistics, Schema) {
                 min_value: Precision::Exact(ScalarValue::Int64(Some(-6783))),
                 sum_value: Precision::Exact(ScalarValue::Int64(Some(10))),
                 null_count: Precision::Exact(5),
+                byte_size: Precision::Absent,
             },
         ],
     },
@@ -240,7 +247,7 @@ async fn sql_basic() -> Result<()> {
     let physical_plan = df.create_physical_plan().await.unwrap();
 
     // the statistics should be those of the source
-    assert_eq!(stats, physical_plan.partition_statistics(None)?);
+    assert_eq!(stats, *physical_plan.partition_statistics(None)?);
 
     Ok(())
 }
@@ -257,7 +264,7 @@ async fn sql_filter() -> Result<()> {
     let physical_plan = df.create_physical_plan().await.unwrap();
 
     let stats = physical_plan.partition_statistics(None)?;
-    assert_eq!(stats.num_rows, Precision::Inexact(1));
+    assert_eq!(stats.num_rows, Precision::Inexact(7));
 
     Ok(())
 }
@@ -270,17 +277,18 @@ async fn sql_limit() -> Result<()> {
     let df = ctx.sql("SELECT * FROM
stats_table LIMIT 5").await.unwrap(); let physical_plan = df.create_physical_plan().await.unwrap(); // when the limit is smaller than the original number of lines we mark the statistics as inexact + // and cap NDV at the new row count + let limit_stats = physical_plan.partition_statistics(None)?; + assert_eq!(limit_stats.num_rows, Precision::Exact(5)); + // c1: NDV=2 stays at 2 (already below limit of 5) assert_eq!( - Statistics { - num_rows: Precision::Exact(5), - column_statistics: stats - .column_statistics - .iter() - .map(|c| c.clone().to_inexact()) - .collect(), - total_byte_size: Precision::Absent - }, - physical_plan.partition_statistics(None)? + limit_stats.column_statistics[0].distinct_count, + Precision::Inexact(2) + ); + // c2: NDV=13 capped to 5 (the limit row count) + assert_eq!( + limit_stats.column_statistics[1].distinct_count, + Precision::Inexact(5) ); let df = ctx @@ -289,7 +297,7 @@ async fn sql_limit() -> Result<()> { .unwrap(); let physical_plan = df.create_physical_plan().await.unwrap(); // when the limit is larger than the original number of lines, statistics remain unchanged - assert_eq!(stats, physical_plan.partition_statistics(None)?); + assert_eq!(stats, *physical_plan.partition_statistics(None)?); Ok(()) } @@ -309,7 +317,7 @@ async fn sql_window() -> Result<()> { let result = physical_plan.partition_statistics(None)?; assert_eq!(stats.num_rows, result.num_rows); - let col_stats = result.column_statistics; + let col_stats = &result.column_statistics; assert_eq!(2, col_stats.len()); assert_eq!(stats.column_statistics[1], col_stats[0]); diff --git a/datafusion/core/tests/data/json_array.json b/datafusion/core/tests/data/json_array.json new file mode 100644 index 0000000000000..1a8716dbf4beb --- /dev/null +++ b/datafusion/core/tests/data/json_array.json @@ -0,0 +1,5 @@ +[ + {"a": 1, "b": "hello"}, + {"a": 2, "b": "world"}, + {"a": 3, "b": "test"} +] diff --git a/datafusion/core/tests/data/json_empty_array.json b/datafusion/core/tests/data/json_empty_array.json new file mode 100644 index 0000000000000..fe51488c7066f --- /dev/null +++ b/datafusion/core/tests/data/json_empty_array.json @@ -0,0 +1 @@ +[] diff --git a/datafusion/core/tests/data/partitioned_table_arrow_stream/part=123/data.arrow b/datafusion/core/tests/data/partitioned_table_arrow_stream/part=123/data.arrow new file mode 100644 index 0000000000000..bad9e3de4a57f Binary files /dev/null and b/datafusion/core/tests/data/partitioned_table_arrow_stream/part=123/data.arrow differ diff --git a/datafusion/core/tests/data/partitioned_table_arrow_stream/part=456/data.arrow b/datafusion/core/tests/data/partitioned_table_arrow_stream/part=456/data.arrow new file mode 100644 index 0000000000000..4a07fbfa47f32 Binary files /dev/null and b/datafusion/core/tests/data/partitioned_table_arrow_stream/part=456/data.arrow differ diff --git a/datafusion/core/tests/data/recursive_cte/closure.csv b/datafusion/core/tests/data/recursive_cte/closure.csv new file mode 100644 index 0000000000000..a31e2bfbf36b6 --- /dev/null +++ b/datafusion/core/tests/data/recursive_cte/closure.csv @@ -0,0 +1,6 @@ +start,end +1,2 +2,3 +2,4 +2,4 +4,1 \ No newline at end of file diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs index 265862ff9af8a..2ada0411f4f8c 100644 --- a/datafusion/core/tests/dataframe/dataframe_functions.rs +++ b/datafusion/core/tests/dataframe/dataframe_functions.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the 
License. -use arrow::array::{types::Int32Type, ListArray}; +use arrow::array::{ListArray, types::Int32Type}; use arrow::datatypes::SchemaRef; use arrow::datatypes::{DataType, Field, Schema}; use arrow::{ @@ -31,7 +31,7 @@ use datafusion::prelude::*; use datafusion_common::test_util::batches_to_string; use datafusion_common::{DFSchema, ScalarValue}; use datafusion_expr::expr::Alias; -use datafusion_expr::{table_scan, ExprSchemable, LogicalPlanBuilder}; +use datafusion_expr::{ExprSchemable, LogicalPlanBuilder, table_scan}; use datafusion_functions_aggregate::expr_fn::{approx_median, approx_percentile_cont}; use datafusion_functions_nested::map::map; use insta::assert_snapshot; @@ -313,10 +313,10 @@ async fn test_fn_arrow_typeof() -> Result<()> { +----------------------+ | arrow_typeof(test.l) | +----------------------+ - | List(nullable Int32) | - | List(nullable Int32) | - | List(nullable Int32) | - | List(nullable Int32) | + | List(Int32) | + | List(Int32) | + | List(Int32) | + | List(Int32) | +----------------------+ "); @@ -402,7 +402,7 @@ async fn test_fn_approx_median() -> Result<()> { +-----------------------+ | approx_median(test.b) | +-----------------------+ - | 10 | + | 10.0 | +-----------------------+ "); @@ -422,7 +422,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { +---------------------------------------------------------------------------+ | approx_percentile_cont(Float64(0.5)) WITHIN GROUP [test.b ASC NULLS LAST] | +---------------------------------------------------------------------------+ - | 10 | + | 10.0 | +---------------------------------------------------------------------------+ "); @@ -437,7 +437,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { +----------------------------------------------------------------------------+ | approx_percentile_cont(Float64(0.1)) WITHIN GROUP [test.b DESC NULLS LAST] | +----------------------------------------------------------------------------+ - | 100 | + | 100.0 | +----------------------------------------------------------------------------+ "); @@ -457,7 +457,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { +--------------------------------------------------------------------+ | approx_percentile_cont(arg_2) WITHIN GROUP [test.b ASC NULLS LAST] | +--------------------------------------------------------------------+ - | 10 | + | 10.0 | +--------------------------------------------------------------------+ " ); @@ -477,7 +477,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { +---------------------------------------------------------------------+ | approx_percentile_cont(arg_2) WITHIN GROUP [test.b DESC NULLS LAST] | +---------------------------------------------------------------------+ - | 100 | + | 100.0 | +---------------------------------------------------------------------+ " ); @@ -494,7 +494,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { +------------------------------------------------------------------------------------+ | approx_percentile_cont(Float64(0.5),Int32(2)) WITHIN GROUP [test.b ASC NULLS LAST] | +------------------------------------------------------------------------------------+ - | 30 | + | 30.25 | +------------------------------------------------------------------------------------+ "); @@ -510,7 +510,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { +-------------------------------------------------------------------------------------+ | approx_percentile_cont(Float64(0.1),Int32(2)) WITHIN GROUP [test.b DESC NULLS LAST] | 
+-------------------------------------------------------------------------------------+ - | 69 | + | 69.85 | +-------------------------------------------------------------------------------------+ "); diff --git a/datafusion/core/tests/dataframe/describe.rs b/datafusion/core/tests/dataframe/describe.rs index 9bd69dfa72b4c..c61fe4fed1615 100644 --- a/datafusion/core/tests/dataframe/describe.rs +++ b/datafusion/core/tests/dataframe/describe.rs @@ -17,7 +17,7 @@ use datafusion::prelude::{ParquetReadOptions, SessionContext}; use datafusion_common::test_util::batches_to_string; -use datafusion_common::{test_util::parquet_test_data, Result}; +use datafusion_common::{Result, test_util::parquet_test_data}; use insta::assert_snapshot; #[tokio::test] diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index 05f5a204c0963..e0830754399db 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -20,10 +20,10 @@ mod dataframe_functions; mod describe; use arrow::array::{ - record_batch, Array, ArrayRef, BooleanArray, DictionaryArray, FixedSizeListArray, - FixedSizeListBuilder, Float32Array, Float64Array, Int32Array, Int32Builder, - Int8Array, LargeListArray, ListArray, ListBuilder, RecordBatch, StringArray, - StringBuilder, StructBuilder, UInt32Array, UInt32Builder, UnionArray, + Array, ArrayRef, BooleanArray, DictionaryArray, FixedSizeListArray, + FixedSizeListBuilder, Float32Array, Float64Array, Int8Array, Int32Array, + Int32Builder, LargeListArray, ListArray, ListBuilder, RecordBatch, StringArray, + StringBuilder, StructBuilder, UInt32Array, UInt32Builder, UnionArray, record_batch, }; use arrow::buffer::ScalarBuffer; use arrow::datatypes::{ @@ -43,6 +43,7 @@ use datafusion_functions_nested::make_array::make_array_udf; use datafusion_functions_window::expr_fn::{first_value, lead, row_number}; use insta::assert_snapshot; use object_store::local::LocalFileSystem; +use rstest::rstest; use std::collections::HashMap; use std::fs; use std::path::Path; @@ -56,18 +57,16 @@ use datafusion::error::Result; use datafusion::execution::context::SessionContext; use datafusion::execution::session_state::SessionStateBuilder; use datafusion::logical_expr::{ColumnarValue, Volatility}; -use datafusion::prelude::{ - CsvReadOptions, JoinType, NdJsonReadOptions, ParquetReadOptions, -}; +use datafusion::prelude::{CsvReadOptions, JoinType, ParquetReadOptions}; use datafusion::test_util::{ parquet_test_data, populate_csv_partitions, register_aggregate_csv, test_table, - test_table_with_name, + test_table_with_cache_factory, test_table_with_name, }; use datafusion_catalog::TableProvider; use datafusion_common::test_util::{batches_to_sort_string, batches_to_string}; use datafusion_common::{ - assert_contains, internal_datafusion_err, Constraint, Constraints, DFSchema, - DataFusionError, ScalarValue, TableReference, UnnestOptions, + Constraint, Constraints, DFSchema, DataFusionError, ScalarValue, SchemaError, + TableReference, UnnestOptions, assert_contains, internal_datafusion_err, }; use datafusion_common_runtime::SpawnedTask; use datafusion_datasource::file_format::format_as_file_type; @@ -76,23 +75,24 @@ use datafusion_execution::runtime_env::RuntimeEnv; use datafusion_expr::expr::{GroupingSet, NullTreatment, Sort, WindowFunction}; use datafusion_expr::var_provider::{VarProvider, VarType}; use datafusion_expr::{ - cast, col, create_udf, exists, in_subquery, lit, out_ref_col, placeholder, - scalar_subquery, when, wildcard, Expr, ExprFunctionExt, 
ExprSchemable, LogicalPlan, - LogicalPlanBuilder, ScalarFunctionImplementation, SortExpr, TableType, WindowFrame, - WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition, + Expr, ExprFunctionExt, ExprSchemable, LogicalPlan, LogicalPlanBuilder, + ScalarFunctionImplementation, SortExpr, TableType, WindowFrame, WindowFrameBound, + WindowFrameUnits, WindowFunctionDefinition, cast, col, create_udf, exists, + in_subquery, lit, out_ref_col, placeholder, scalar_subquery, when, wildcard, }; +use datafusion_physical_expr::Partitioning; use datafusion_physical_expr::aggregate::AggregateExprBuilder; use datafusion_physical_expr::expressions::Column; -use datafusion_physical_expr::Partitioning; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr; use datafusion_physical_plan::aggregates::{ AggregateExec, AggregateMode, PhysicalGroupBy, }; use datafusion_physical_plan::empty::EmptyExec; -use datafusion_physical_plan::{displayable, ExecutionPlan, ExecutionPlanProperties}; +use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties, displayable}; use datafusion::error::Result as DataFusionResult; +use datafusion::execution::options::JsonReadOptions; use datafusion_functions_window::expr_fn::lag; // Get string representation of the plan @@ -305,6 +305,27 @@ async fn select_columns() -> Result<()> { Ok(()) } +#[tokio::test] +async fn select_columns_with_nonexistent_columns() -> Result<()> { + let t = test_table().await?; + let t2 = t.select_columns(&["canada", "c2", "rocks"]); + + match t2 { + Err(DataFusionError::SchemaError(boxed_err, _)) => { + // Verify it's the first invalid column + match boxed_err.as_ref() { + SchemaError::FieldNotFound { field, .. } => { + assert_eq!(field.name(), "canada"); + } + _ => panic!("Expected SchemaError::FieldNotFound for 'canada'"), + } + } + _ => panic!("Expected SchemaError"), + } + + Ok(()) +} + #[tokio::test] async fn select_expr() -> Result<()> { // build plan using Table API @@ -392,14 +413,14 @@ async fn select_with_periods() -> Result<()> { assert_snapshot!( batches_to_sort_string(&df_results), - @r###" + @r" +------+ | f.c1 | +------+ | 1 | | 10 | +------+ - "### + " ); Ok(()) @@ -513,7 +534,8 @@ async fn drop_columns_with_nonexistent_columns() -> Result<()> { async fn drop_columns_with_empty_array() -> Result<()> { // build plan using Table API let t = test_table().await?; - let t2 = t.drop_columns(&[])?; + let drop_columns = vec![] as Vec<&str>; + let t2 = t.drop_columns(&drop_columns)?; let plan = t2.logical_plan().clone(); // build query using SQL @@ -528,6 +550,107 @@ async fn drop_columns_with_empty_array() -> Result<()> { Ok(()) } +#[tokio::test] +async fn drop_columns_qualified() -> Result<()> { + // build plan using Table API + let mut t = test_table().await?; + t = t.select_columns(&["c1", "c2", "c11"])?; + let mut t2 = test_table_with_name("another_table").await?; + t2 = t2.select_columns(&["c1", "c2", "c11"])?; + let mut t3 = t.join_on( + t2, + JoinType::Inner, + [col("aggregate_test_100.c1").eq(col("another_table.c1"))], + )?; + t3 = t3.drop_columns(&["another_table.c2", "another_table.c11"])?; + + let plan = t3.logical_plan().clone(); + + let sql = "SELECT aggregate_test_100.c1, aggregate_test_100.c2, aggregate_test_100.c11, another_table.c1 FROM (SELECT c1, c2, c11 FROM aggregate_test_100) INNER JOIN (SELECT c1, c2, c11 FROM another_table) ON aggregate_test_100.c1 = another_table.c1"; + let ctx = SessionContext::new(); + register_aggregate_csv(&ctx, 
"aggregate_test_100").await?; + register_aggregate_csv(&ctx, "another_table").await?; + let sql_plan = ctx.sql(sql).await?.into_unoptimized_plan(); + + // the two plans should be identical + assert_same_plan(&plan, &sql_plan); + + Ok(()) +} + +#[tokio::test] +async fn drop_columns_qualified_find_qualified() -> Result<()> { + // build plan using Table API + let mut t = test_table().await?; + t = t.select_columns(&["c1", "c2", "c11"])?; + let mut t2 = test_table_with_name("another_table").await?; + t2 = t2.select_columns(&["c1", "c2", "c11"])?; + let mut t3 = t.join_on( + t2.clone(), + JoinType::Inner, + [col("aggregate_test_100.c1").eq(col("another_table.c1"))], + )?; + t3 = t3.drop_columns(&t2.find_qualified_columns(&["c2", "c11"])?)?; + + let plan = t3.logical_plan().clone(); + + let sql = "SELECT aggregate_test_100.c1, aggregate_test_100.c2, aggregate_test_100.c11, another_table.c1 FROM (SELECT c1, c2, c11 FROM aggregate_test_100) INNER JOIN (SELECT c1, c2, c11 FROM another_table) ON aggregate_test_100.c1 = another_table.c1"; + let ctx = SessionContext::new(); + register_aggregate_csv(&ctx, "aggregate_test_100").await?; + register_aggregate_csv(&ctx, "another_table").await?; + let sql_plan = ctx.sql(sql).await?.into_unoptimized_plan(); + + // the two plans should be identical + assert_same_plan(&plan, &sql_plan); + + Ok(()) +} + +#[tokio::test] +async fn test_find_qualified_names() -> Result<()> { + let t = test_table().await?; + let column_names = ["c1", "c2", "c3"]; + let columns = t.find_qualified_columns(&column_names)?; + + // Expected results for each column + let binding = TableReference::bare("aggregate_test_100"); + let expected = [ + (Some(&binding), "c1"), + (Some(&binding), "c2"), + (Some(&binding), "c3"), + ]; + + // Verify we got the expected number of results + assert_eq!( + columns.len(), + expected.len(), + "Expected {} columns, got {}", + expected.len(), + columns.len() + ); + + // Iterate over the results and check each one individually + for (i, (actual, expected)) in columns.iter().zip(expected.iter()).enumerate() { + let (actual_table_ref, actual_field_ref) = actual; + let (expected_table_ref, expected_field_name) = expected; + + // Check table reference + assert_eq!( + actual_table_ref, expected_table_ref, + "Column {i}: expected table reference {expected_table_ref:?}, got {actual_table_ref:?}" + ); + + // Check field name + assert_eq!( + actual_field_ref.name(), + *expected_field_name, + "Column {i}: expected field name '{expected_field_name}', got '{actual_field_ref}'" + ); + } + + Ok(()) +} + #[tokio::test] async fn drop_with_quotes() -> Result<()> { // define data with a column name that has a "." 
in it: @@ -547,14 +670,14 @@ async fn drop_with_quotes() -> Result<()> { assert_snapshot!( batches_to_sort_string(&df_results), - @r###" + @r#" +------+ | f"c2 | +------+ | 11 | | 2 | +------+ - "### + "# ); Ok(()) @@ -573,20 +696,20 @@ async fn drop_with_periods() -> Result<()> { let ctx = SessionContext::new(); ctx.register_batch("t", batch)?; - let df = ctx.table("t").await?.drop_columns(&["f.c1"])?; + let df = ctx.table("t").await?.drop_columns(&["\"f.c1\""])?; let df_results = df.collect().await?; assert_snapshot!( batches_to_sort_string(&df_results), - @r###" + @r" +------+ | f.c2 | +------+ | 11 | | 2 | +------+ - "### + " ); Ok(()) @@ -723,23 +846,23 @@ async fn test_aggregate_with_pk() -> Result<()> { assert_snapshot!( physical_plan_to_string(&df).await, - @r###" + @r" AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[] DataSourceExec: partitions=1, partition_sizes=[1] - "### + " ); let df_results = df.collect().await?; assert_snapshot!( batches_to_sort_string(&df_results), - @r###" + @r" +----+------+ | id | name | +----+------+ | 1 | a | +----+------+ - "### + " ); Ok(()) @@ -766,9 +889,8 @@ async fn test_aggregate_with_pk2() -> Result<()> { physical_plan_to_string(&df).await, @r" AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[], ordering_mode=Sorted - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: id@0 = 1 AND name@1 = a - DataSourceExec: partitions=1, partition_sizes=[1] + FilterExec: id@0 = 1 AND name@1 = a + DataSourceExec: partitions=1, partition_sizes=[1] " ); @@ -778,13 +900,13 @@ async fn test_aggregate_with_pk2() -> Result<()> { assert_snapshot!( batches_to_sort_string(&df_results), - @r###" + @r" +----+------+ | id | name | +----+------+ | 1 | a | +----+------+ - "### + " ); Ok(()) @@ -815,9 +937,8 @@ async fn test_aggregate_with_pk3() -> Result<()> { physical_plan_to_string(&df).await, @r" AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[], ordering_mode=PartiallySorted([0]) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: id@0 = 1 - DataSourceExec: partitions=1, partition_sizes=[1] + FilterExec: id@0 = 1 + DataSourceExec: partitions=1, partition_sizes=[1] " ); @@ -827,13 +948,13 @@ async fn test_aggregate_with_pk3() -> Result<()> { assert_snapshot!( batches_to_sort_string(&df_results), - @r###" + @r" +----+------+ | id | name | +----+------+ | 1 | a | +----+------+ - "### + " ); Ok(()) @@ -866,9 +987,8 @@ async fn test_aggregate_with_pk4() -> Result<()> { physical_plan_to_string(&df).await, @r" AggregateExec: mode=Single, gby=[id@0 as id], aggr=[], ordering_mode=Sorted - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: id@0 = 1 - DataSourceExec: partitions=1, partition_sizes=[1] + FilterExec: id@0 = 1 + DataSourceExec: partitions=1, partition_sizes=[1] " ); @@ -876,13 +996,13 @@ async fn test_aggregate_with_pk4() -> Result<()> { assert_snapshot!( batches_to_sort_string(&df_results), - @r###" + @r" +----+ | id | +----+ | 1 | +----+ - "### + " ); Ok(()) @@ -904,7 +1024,7 @@ async fn test_aggregate_alias() -> Result<()> { assert_snapshot!( batches_to_sort_string(&df_results), - @r###" + @r" +----+ | c2 | +----+ @@ -914,7 +1034,7 @@ async fn test_aggregate_alias() -> Result<()> { | 5 | | 6 | +----+ - "### + " ); Ok(()) @@ -951,7 +1071,7 @@ async fn test_aggregate_with_union() -> Result<()> { assert_snapshot!( batches_to_sort_string(&df_results), - @r###" + @r" +----+------------+ | c1 | sum_result | +----+------------+ @@ -961,7 +1081,7 @@ async fn test_aggregate_with_union() -> 
Result<()> { | d | 126 | | e | 121 | +----+------------+ - "### + " ); Ok(()) } @@ -987,7 +1107,7 @@ async fn test_aggregate_subexpr() -> Result<()> { assert_snapshot!( batches_to_sort_string(&df_results), - @r###" + @r" +----------------+------+ | c2 + Int32(10) | sum | +----------------+------+ @@ -997,7 +1117,7 @@ async fn test_aggregate_subexpr() -> Result<()> { | 15 | 95 | | 16 | -146 | +----------------+------+ - "### + " ); Ok(()) @@ -1020,7 +1140,7 @@ async fn test_aggregate_name_collision() -> Result<()> { // The select expr has the same display_name as the group_expr, // but since they are different expressions, it should fail. .expect_err("Expected error"); - assert_snapshot!(df.strip_backtrace(), @r###"Schema error: No field named aggregate_test_100.c2. Valid fields are "aggregate_test_100.c2 + aggregate_test_100.c3"."###); + assert_snapshot!(df.strip_backtrace(), @r#"Schema error: No field named aggregate_test_100.c2. Valid fields are "aggregate_test_100.c2 + aggregate_test_100.c3"."#); Ok(()) } @@ -1079,33 +1199,33 @@ async fn window_using_aggregates() -> Result<()> { assert_snapshot!( batches_to_sort_string(&df), - @r###" + @r" +-------------+----------+-----------------+---------------+--------+-----+------+----+------+ | first_value | last_val | approx_distinct | approx_median | median | max | min | c2 | c3 | +-------------+----------+-----------------+---------------+--------+-----+------+----+------+ | | | | | | | | 1 | -85 | - | -85 | -101 | 14 | -12 | -101 | 83 | -101 | 4 | -54 | - | -85 | -101 | 17 | -25 | -101 | 83 | -101 | 5 | -31 | - | -85 | -12 | 10 | -32 | -12 | 83 | -85 | 3 | 13 | - | -85 | -25 | 3 | -56 | -25 | -25 | -85 | 1 | -5 | - | -85 | -31 | 18 | -29 | -31 | 83 | -101 | 5 | 36 | - | -85 | -38 | 16 | -25 | -38 | 83 | -101 | 4 | 65 | - | -85 | -43 | 7 | -43 | -43 | 83 | -85 | 2 | 45 | - | -85 | -48 | 6 | -35 | -48 | 83 | -85 | 2 | -43 | - | -85 | -5 | 4 | -37 | -5 | -5 | -85 | 1 | 83 | - | -85 | -54 | 15 | -17 | -54 | 83 | -101 | 4 | -38 | - | -85 | -56 | 2 | -70 | -56 | -56 | -85 | 1 | -25 | - | -85 | -72 | 9 | -43 | -72 | 83 | -85 | 3 | -12 | - | -85 | -85 | 1 | -85 | -85 | -85 | -85 | 1 | -56 | - | -85 | 13 | 11 | -17 | 13 | 83 | -85 | 3 | 14 | - | -85 | 13 | 11 | -25 | 13 | 83 | -85 | 3 | 13 | - | -85 | 14 | 12 | -12 | 14 | 83 | -85 | 3 | 17 | - | -85 | 17 | 13 | -11 | 17 | 83 | -85 | 4 | -101 | - | -85 | 45 | 8 | -34 | 45 | 83 | -85 | 3 | -72 | - | -85 | 65 | 17 | -17 | 65 | 83 | -101 | 5 | -101 | - | -85 | 83 | 5 | -25 | 83 | 83 | -85 | 2 | -48 | + | -85 | -101 | 14 | -12.0 | -12 | 83 | -101 | 4 | -54 | + | -85 | -101 | 17 | -25.0 | -25 | 83 | -101 | 5 | -31 | + | -85 | -12 | 10 | -32.75 | -34 | 83 | -85 | 3 | 13 | + | -85 | -25 | 3 | -56.0 | -56 | -25 | -85 | 1 | -5 | + | -85 | -31 | 18 | -29.75 | -28 | 83 | -101 | 5 | 36 | + | -85 | -38 | 16 | -25.0 | -25 | 83 | -101 | 4 | 65 | + | -85 | -43 | 7 | -43.0 | -43 | 83 | -85 | 2 | 45 | + | -85 | -48 | 6 | -35.75 | -36 | 83 | -85 | 2 | -43 | + | -85 | -5 | 4 | -37.75 | -40 | -5 | -85 | 1 | 83 | + | -85 | -54 | 15 | -17.0 | -18 | 83 | -101 | 4 | -38 | + | -85 | -56 | 2 | -70.5 | -70 | -56 | -85 | 1 | -25 | + | -85 | -72 | 9 | -43.0 | -43 | 83 | -85 | 3 | -12 | + | -85 | -85 | 1 | -85.0 | -85 | -85 | -85 | 1 | -56 | + | -85 | 13 | 11 | -17.0 | -18 | 83 | -85 | 3 | 14 | + | -85 | 13 | 11 | -25.0 | -25 | 83 | -85 | 3 | 13 | + | -85 | 14 | 12 | -12.0 | -12 | 83 | -85 | 3 | 17 | + | -85 | 17 | 13 | -11.25 | -8 | 83 | -85 | 4 | -101 | + | -85 | 45 | 8 | -34.5 | -34 | 83 | -85 | 3 | -72 | + | -85 | 65 | 17 | 
-17.0 | -18 | 83 | -101 | 5 | -101 | + | -85 | 83 | 5 | -25.0 | -25 | 83 | -85 | 2 | -48 | +-------------+----------+-----------------+---------------+--------+-----+------+----+------+ - "### + " ); Ok(()) @@ -1172,7 +1292,7 @@ async fn window_aggregates_with_filter() -> Result<()> { assert_snapshot!( batches_to_string(&results), - @r###" + @r" +---------+---------+---------+---------+---------+----+-----+ | sum_pos | avg_pos | min_pos | max_pos | cnt_pos | ts | val | +---------+---------+---------+---------+---------+----+-----+ @@ -1182,7 +1302,7 @@ async fn window_aggregates_with_filter() -> Result<()> { | 5 | 2.5 | 1 | 4 | 2 | 4 | 4 | | 5 | 2.5 | 1 | 4 | 2 | 5 | -1 | +---------+---------+---------+---------+---------+----+-----+ - "### + " ); Ok(()) @@ -1238,7 +1358,7 @@ async fn test_distinct_sort_by() -> Result<()> { assert_snapshot!( batches_to_sort_string(&df_results), - @r###" + @r" +----+ | c1 | +----+ @@ -1248,7 +1368,7 @@ async fn test_distinct_sort_by() -> Result<()> { | d | | e | +----+ - "### + " ); Ok(()) @@ -1286,7 +1406,7 @@ async fn test_distinct_on() -> Result<()> { assert_snapshot!( batches_to_sort_string(&df_results), - @r###" + @r" +----+ | c1 | +----+ @@ -1296,7 +1416,7 @@ async fn test_distinct_on() -> Result<()> { | d | | e | +----+ - "### + " ); Ok(()) @@ -1321,7 +1441,7 @@ async fn test_distinct_on_sort_by() -> Result<()> { assert_snapshot!( batches_to_sort_string(&df_results), - @r###" + @r" +----+ | c1 | +----+ @@ -1331,7 +1451,7 @@ async fn test_distinct_on_sort_by() -> Result<()> { | d | | e | +----+ - "### + " ); Ok(()) @@ -1395,13 +1515,13 @@ async fn join_coercion_unnamed() -> Result<()> { assert_snapshot!( batches_to_sort_string(&results), - @r###" + @r" +----+------+ | id | name | +----+------+ | 10 | d | +----+------+ - "### + " ); Ok(()) } @@ -1420,13 +1540,13 @@ async fn join_on() -> Result<()> { [col("a.c1").not_eq(col("b.c1")), col("a.c2").eq(col("b.c2"))], )?; - assert_snapshot!(join.logical_plan(), @r###" + assert_snapshot!(join.logical_plan(), @r" Inner Join: Filter: a.c1 != b.c1 AND a.c2 = b.c2 Projection: a.c1, a.c2 TableScan: a Projection: b.c1, b.c2 TableScan: b - "###); + "); Ok(()) } @@ -1449,7 +1569,11 @@ async fn join_on_filter_datatype() -> Result<()> { let err = join.into_optimized_plan().unwrap_err(); assert_snapshot!( err.strip_backtrace(), - @"type_coercion\ncaused by\nError during planning: Join condition must be boolean type, but got Utf8" + @r" + type_coercion + caused by + Error during planning: Join condition must be boolean type, but got Utf8 + " ); Ok(()) } @@ -1627,7 +1751,9 @@ async fn register_table() -> Result<()> { let df_impl = DataFrame::new(ctx.state(), df.logical_plan().clone()); // register a dataframe as a table - ctx.register_table("test_table", df_impl.clone().into_view())?; + let table_provider = df_impl.clone().into_view(); + assert_eq!(table_provider.table_type(), TableType::View); + ctx.register_table("test_table", table_provider)?; // pull the table out let table = ctx.table("test_table").await?; @@ -1644,7 +1770,7 @@ async fn register_table() -> Result<()> { assert_snapshot!( batches_to_sort_string(&df_results), - @r###" + @r" +----+-----------------------------+ | c1 | sum(aggregate_test_100.c12) | +----+-----------------------------+ @@ -1654,13 +1780,13 @@ async fn register_table() -> Result<()> { | d | 8.793968289758968 | | e | 10.206140546981722 | +----+-----------------------------+ - "### + " ); // the results are the same as the results from the view, modulo the leaf table name assert_snapshot!( 
batches_to_sort_string(table_results), - @r###" + @r" +----+---------------------+ | c1 | sum(test_table.c12) | +----+---------------------+ @@ -1670,7 +1796,7 @@ async fn register_table() -> Result<()> { | d | 8.793968289758968 | | e | 10.206140546981722 | +----+---------------------+ - "### + " ); Ok(()) } @@ -1719,7 +1845,7 @@ async fn with_column() -> Result<()> { assert_snapshot!( batches_to_sort_string(&df_results), - @r###" + @r" +----+----+-----+-----+ | c1 | c2 | c3 | sum | +----+----+-----+-----+ @@ -1730,7 +1856,7 @@ async fn with_column() -> Result<()> { | a | 3 | 14 | 17 | | a | 3 | 17 | 20 | +----+----+-----+-----+ - "### + " ); // check that col with the same name overwritten @@ -1742,7 +1868,7 @@ async fn with_column() -> Result<()> { assert_snapshot!( batches_to_sort_string(&df_results_overwrite), - @r###" + @r" +-----+----+-----+-----+ | c1 | c2 | c3 | sum | +-----+----+-----+-----+ @@ -1753,7 +1879,7 @@ async fn with_column() -> Result<()> { | 17 | 3 | 14 | 17 | | 20 | 3 | 17 | 20 | +-----+----+-----+-----+ - "### + " ); // check that col with the same name overwritten using same name as reference @@ -1765,7 +1891,7 @@ async fn with_column() -> Result<()> { assert_snapshot!( batches_to_sort_string(&df_results_overwrite_self), - @r###" + @r" +----+----+-----+-----+ | c1 | c2 | c3 | sum | +----+----+-----+-----+ @@ -1776,7 +1902,7 @@ async fn with_column() -> Result<()> { | a | 4 | 14 | 17 | | a | 4 | 17 | 20 | +----+----+-----+-----+ - "### + " ); Ok(()) @@ -1804,14 +1930,14 @@ async fn test_window_function_with_column() -> Result<()> { let df_results = df.clone().collect().await?; assert_snapshot!( batches_to_sort_string(&df_results), - @r###" + @r" +----+----+-----+-----+---+ | c1 | c2 | c3 | s | r | +----+----+-----+-----+---+ | c | 2 | 1 | 3 | 1 | | d | 5 | -40 | -35 | 2 | +----+----+-----+-----+---+ - "### + " ); Ok(()) @@ -1846,13 +1972,13 @@ async fn with_column_join_same_columns() -> Result<()> { let df_results = df.clone().collect().await?; assert_snapshot!( batches_to_sort_string(&df_results), - @r###" + @r" +----+----+ | c1 | c1 | +----+----+ | a | a | +----+----+ - "### + " ); let df_with_column = df.clone().with_column("new_column", lit(true))?; @@ -1875,7 +2001,7 @@ async fn with_column_join_same_columns() -> Result<()> { assert_snapshot!( df_with_column.clone().into_optimized_plan().unwrap(), - @r###" + @r" Projection: t1.c1, t2.c1, Boolean(true) AS new_column Sort: t1.c1 ASC NULLS FIRST, fetch=1 Inner Join: t1.c1 = t2.c1 @@ -1883,20 +2009,20 @@ async fn with_column_join_same_columns() -> Result<()> { TableScan: aggregate_test_100 projection=[c1] SubqueryAlias: t2 TableScan: aggregate_test_100 projection=[c1] - "### + " ); let df_results = df_with_column.collect().await?; assert_snapshot!( batches_to_sort_string(&df_results), - @r###" + @r" +----+----+------------+ | c1 | c1 | new_column | +----+----+------------+ | a | a | true | +----+----+------------+ - "### + " ); Ok(()) @@ -1946,13 +2072,13 @@ async fn with_column_renamed() -> Result<()> { assert_snapshot!( batches_to_sort_string(batches), - @r###" + @r" +-----+-----+-----+-------+ | one | two | c3 | total | +-----+-----+-----+-------+ | a | 3 | -72 | -69 | +-----+-----+-----+-------+ - "### + " ); Ok(()) @@ -2017,13 +2143,13 @@ async fn with_column_renamed_join() -> Result<()> { let df_results = df.clone().collect().await?; assert_snapshot!( batches_to_sort_string(&df_results), - @r###" + @r" +----+----+-----+----+----+-----+ | c1 | c2 | c3 | c1 | c2 | c3 | +----+----+-----+----+----+-----+ | a | 1 | 
-85 | a | 1 | -85 | +----+----+-----+----+----+-----+ - "### + " ); let df_renamed = df.clone().with_column_renamed("t1.c1", "AAA")?; @@ -2046,7 +2172,7 @@ async fn with_column_renamed_join() -> Result<()> { assert_snapshot!( df_renamed.clone().into_optimized_plan().unwrap(), - @r###" + @r" Projection: t1.c1 AS AAA, t1.c2, t1.c3, t2.c1, t2.c2, t2.c3 Sort: t1.c1 ASC NULLS FIRST, t1.c2 ASC NULLS FIRST, t1.c3 ASC NULLS FIRST, t2.c1 ASC NULLS FIRST, t2.c2 ASC NULLS FIRST, t2.c3 ASC NULLS FIRST, fetch=1 Inner Join: t1.c1 = t2.c1 @@ -2054,20 +2180,20 @@ async fn with_column_renamed_join() -> Result<()> { TableScan: aggregate_test_100 projection=[c1, c2, c3] SubqueryAlias: t2 TableScan: aggregate_test_100 projection=[c1, c2, c3] - "### + " ); let df_results = df_renamed.collect().await?; assert_snapshot!( batches_to_sort_string(&df_results), - @r###" + @r" +-----+----+-----+----+----+-----+ | AAA | c2 | c3 | c1 | c2 | c3 | +-----+----+-----+----+----+-----+ | a | 1 | -85 | a | 1 | -85 | +-----+----+-----+----+----+-----+ - "### + " ); Ok(()) @@ -2102,13 +2228,13 @@ async fn with_column_renamed_case_sensitive() -> Result<()> { assert_snapshot!( batches_to_sort_string(res), - @r###" + @r" +---------+ | CoLuMn1 | +---------+ | a | +---------+ - "### + " ); let df_renamed = df_renamed @@ -2118,13 +2244,13 @@ async fn with_column_renamed_case_sensitive() -> Result<()> { assert_snapshot!( batches_to_sort_string(&df_renamed), - @r###" + @r" +----+ | c1 | +----+ | a | +----+ - "### + " ); Ok(()) @@ -2162,19 +2288,19 @@ async fn describe_lookup_via_quoted_identifier() -> Result<()> { .await?; assert_snapshot!( batches_to_sort_string(&describe_result.clone().collect().await?), - @r###" - +------------+--------------+ - | describe | CoLu.Mn["1"] | - +------------+--------------+ - | count | 1 | - | max | a | - | mean | null | - | median | null | - | min | a | - | null_count | 0 | - | std | null | - +------------+--------------+ - "### + @r#" + +------------+--------------+ + | describe | CoLu.Mn["1"] | + +------------+--------------+ + | count | 1 | + | max | a | + | mean | null | + | median | null | + | min | a | + | null_count | 0 | + | std | null | + +------------+--------------+ + "# ); Ok(()) @@ -2192,13 +2318,13 @@ async fn cast_expr_test() -> Result<()> { df.clone().show().await?; assert_snapshot!( batches_to_sort_string(&df_results), - @r###" + @r" +----+----+-----+ | c2 | c3 | sum | +----+----+-----+ | 2 | 1 | 3 | +----+----+-----+ - "### + " ); Ok(()) @@ -2214,12 +2340,14 @@ async fn row_writer_resize_test() -> Result<()> { let data = RecordBatch::try_new( schema, - vec![ - Arc::new(StringArray::from(vec![ - Some("2a0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"), - Some("3a0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000800"), - ])) - ], + vec![Arc::new(StringArray::from(vec![ + Some( + "2a0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", + ), + Some( + "3a0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000800", + ), + ]))], )?; let ctx = SessionContext::new(); @@ -2258,14 +2386,14 @@ async fn with_column_name() -> Result<()> { assert_snapshot!( batches_to_sort_string(&df_results), - @r###" + @r" +------+-------+ | f.c1 | f.c2 | 
+------+-------+ | 1 | hello | | 10 | hello | +------+-------+ - "### + " ); Ok(()) @@ -2301,13 +2429,13 @@ async fn cache_test() -> Result<()> { let cached_df_results = cached_df.collect().await?; assert_snapshot!( batches_to_sort_string(&cached_df_results), - @r###" + @r" +----+----+-----+ | c2 | c3 | sum | +----+----+-----+ | 2 | 1 | 3 | +----+----+-----+ - "### + " ); assert_eq!(&df_results, &cached_df_results); @@ -2315,6 +2443,28 @@ async fn cache_test() -> Result<()> { Ok(()) } +#[tokio::test] +async fn cache_producer_test() -> Result<()> { + let df = test_table_with_cache_factory() + .await? + .select_columns(&["c2", "c3"])? + .limit(0, Some(1))? + .with_column("sum", cast(col("c2") + col("c3"), DataType::Int64))?; + + let cached_df = df.clone().cache().await?; + + assert_snapshot!( + cached_df.clone().into_optimized_plan().unwrap(), + @r" + CacheNode + Projection: aggregate_test_100.c2, aggregate_test_100.c3, CAST(CAST(aggregate_test_100.c2 AS Int64) + CAST(aggregate_test_100.c3 AS Int64) AS Int64) AS sum + Limit: skip=0, fetch=1 + TableScan: aggregate_test_100 projection=[c2, c3], fetch=1 + " + ); + Ok(()) +} + #[tokio::test] async fn partition_aware_union() -> Result<()> { let left = test_table().await?.select_columns(&["c1", "c2"])?; @@ -2584,13 +2734,13 @@ async fn filtered_aggr_with_param_values() -> Result<()> { let df_results = df?.collect().await?; assert_snapshot!( batches_to_string(&df_results), - @r###" + @r" +------------------------------------------------+ | count(table1.c2) FILTER (WHERE table1.c3 > $1) | +------------------------------------------------+ | 54 | +------------------------------------------------+ - "### + " ); Ok(()) @@ -2638,7 +2788,7 @@ async fn write_parquet_with_order() -> Result<()> { assert_snapshot!( batches_to_string(&results), - @r###" + @r" +---+---+ | a | b | +---+---+ @@ -2648,7 +2798,7 @@ async fn write_parquet_with_order() -> Result<()> { | 5 | 3 | | 7 | 4 | +---+---+ - "### + " ); Ok(()) @@ -2696,7 +2846,7 @@ async fn write_csv_with_order() -> Result<()> { assert_snapshot!( batches_to_string(&results), - @r###" + @r" +---+---+ | a | b | +---+---+ @@ -2706,7 +2856,7 @@ async fn write_csv_with_order() -> Result<()> { | 5 | 3 | | 7 | 4 | +---+---+ - "### + " ); Ok(()) } @@ -2744,7 +2894,7 @@ async fn write_json_with_order() -> Result<()> { ctx.register_json( "data", test_path.to_str().unwrap(), - NdJsonReadOptions::default().schema(&schema), + JsonReadOptions::default().schema(&schema), ) .await?; @@ -2753,7 +2903,7 @@ async fn write_json_with_order() -> Result<()> { assert_snapshot!( batches_to_string(&results), - @r###" + @r" +---+---+ | a | b | +---+---+ @@ -2763,7 +2913,7 @@ async fn write_json_with_order() -> Result<()> { | 5 | 3 | | 7 | 4 | +---+---+ - "### + " ); Ok(()) } @@ -2812,7 +2962,7 @@ async fn write_table_with_order() -> Result<()> { assert_snapshot!( batches_to_string(&results), - @r###" + @r" +-----------+ | tablecol1 | +-----------+ @@ -2822,7 +2972,7 @@ async fn write_table_with_order() -> Result<()> { | x | | z | +-----------+ - "### + " ); Ok(()) } @@ -2849,50 +2999,44 @@ async fn test_count_wildcard_on_sort() -> Result<()> { assert_snapshot!( pretty_format_batches(&sql_results).unwrap(), - @r###" - +---------------+------------------------------------------------------------------------------------------------------------+ - | plan_type | plan | - +---------------+------------------------------------------------------------------------------------------------------------+ - | logical_plan | Projection: t1.b, 
count(*) | - | | Sort: count(Int64(1)) AS count(*) AS count(*) ASC NULLS LAST | - | | Projection: t1.b, count(Int64(1)) AS count(*), count(Int64(1)) | - | | Aggregate: groupBy=[[t1.b]], aggr=[[count(Int64(1))]] | - | | TableScan: t1 projection=[b] | - | physical_plan | ProjectionExec: expr=[b@0 as b, count(*)@1 as count(*)] | - | | SortPreservingMergeExec: [count(Int64(1))@2 ASC NULLS LAST] | - | | SortExec: expr=[count(*)@1 ASC NULLS LAST], preserve_partitioning=[true] | - | | ProjectionExec: expr=[b@0 as b, count(Int64(1))@1 as count(*), count(Int64(1))@1 as count(Int64(1))] | - | | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[count(Int64(1))] | - | | CoalesceBatchesExec: target_batch_size=8192 | - | | RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=4 | - | | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | - | | AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(Int64(1))] | - | | DataSourceExec: partitions=1, partition_sizes=[1] | - | | | - +---------------+------------------------------------------------------------------------------------------------------------+ - "### + @r" + +---------------+------------------------------------------------------------------------------------+ + | plan_type | plan | + +---------------+------------------------------------------------------------------------------------+ + | logical_plan | Sort: count(*) ASC NULLS LAST | + | | Projection: t1.b, count(Int64(1)) AS count(*) | + | | Aggregate: groupBy=[[t1.b]], aggr=[[count(Int64(1))]] | + | | TableScan: t1 projection=[b] | + | physical_plan | SortPreservingMergeExec: [count(*)@1 ASC NULLS LAST] | + | | SortExec: expr=[count(*)@1 ASC NULLS LAST], preserve_partitioning=[true] | + | | ProjectionExec: expr=[b@0 as b, count(Int64(1))@1 as count(*)] | + | | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[count(Int64(1))] | + | | RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=1 | + | | AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(Int64(1))] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+------------------------------------------------------------------------------------+ + " ); assert_snapshot!( pretty_format_batches(&df_results).unwrap(), - @r###" - +---------------+--------------------------------------------------------------------------------+ - | plan_type | plan | - +---------------+--------------------------------------------------------------------------------+ - | logical_plan | Sort: count(*) ASC NULLS LAST | - | | Aggregate: groupBy=[[t1.b]], aggr=[[count(Int64(1)) AS count(*)]] | - | | TableScan: t1 projection=[b] | - | physical_plan | SortPreservingMergeExec: [count(*)@1 ASC NULLS LAST] | - | | SortExec: expr=[count(*)@1 ASC NULLS LAST], preserve_partitioning=[true] | - | | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[count(*)] | - | | CoalesceBatchesExec: target_batch_size=8192 | - | | RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=4 | - | | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | - | | AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(*)] | - | | DataSourceExec: partitions=1, partition_sizes=[1] | - | | | - +---------------+--------------------------------------------------------------------------------+ - "### + @r" + +---------------+----------------------------------------------------------------------------+ + | plan_type | plan | + 
+---------------+----------------------------------------------------------------------------+ + | logical_plan | Sort: count(*) AS count(*) ASC NULLS LAST | + | | Aggregate: groupBy=[[t1.b]], aggr=[[count(Int64(1)) AS count(*)]] | + | | TableScan: t1 projection=[b] | + | physical_plan | SortPreservingMergeExec: [count(*)@1 ASC NULLS LAST] | + | | SortExec: expr=[count(*)@1 ASC NULLS LAST], preserve_partitioning=[true] | + | | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[count(*)] | + | | RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=1 | + | | AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(*)] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+----------------------------------------------------------------------------+ + " ); Ok(()) } @@ -2910,23 +3054,22 @@ async fn test_count_wildcard_on_where_in() -> Result<()> { assert_snapshot!( pretty_format_batches(&sql_results).unwrap(), @r" - +---------------+------------------------------------------------------------------------------------------------------------------------+ - | plan_type | plan | - +---------------+------------------------------------------------------------------------------------------------------------------------+ - | logical_plan | LeftSemi Join: CAST(t1.a AS Int64) = __correlated_sq_1.count(*) | - | | TableScan: t1 projection=[a, b] | - | | SubqueryAlias: __correlated_sq_1 | - | | Projection: count(Int64(1)) AS count(*) | - | | Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] | - | | TableScan: t2 projection=[] | - | physical_plan | CoalesceBatchesExec: target_batch_size=8192 | - | | HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(count(*)@0, CAST(t1.a AS Int64)@2)], projection=[a@0, b@1] | - | | ProjectionExec: expr=[4 as count(*)] | - | | PlaceholderRowExec | - | | ProjectionExec: expr=[a@0 as a, b@1 as b, CAST(a@0 AS Int64) as CAST(t1.a AS Int64)] | - | | DataSourceExec: partitions=1, partition_sizes=[1] | - | | | - +---------------+------------------------------------------------------------------------------------------------------------------------+ + +---------------+----------------------------------------------------------------------------------------------------------------------+ + | plan_type | plan | + +---------------+----------------------------------------------------------------------------------------------------------------------+ + | logical_plan | LeftSemi Join: CAST(t1.a AS Int64) = __correlated_sq_1.count(*) | + | | TableScan: t1 projection=[a, b] | + | | SubqueryAlias: __correlated_sq_1 | + | | Projection: count(Int64(1)) AS count(*) | + | | Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] | + | | TableScan: t2 projection=[] | + | physical_plan | HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(count(*)@0, CAST(t1.a AS Int64)@2)], projection=[a@0, b@1] | + | | ProjectionExec: expr=[4 as count(*)] | + | | PlaceholderRowExec | + | | ProjectionExec: expr=[a@0 as a, b@1 as b, CAST(a@0 AS Int64) as CAST(t1.a AS Int64)] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+----------------------------------------------------------------------------------------------------------------------+ " ); @@ -2956,22 +3099,21 @@ async fn test_count_wildcard_on_where_in() -> Result<()> { assert_snapshot!( pretty_format_batches(&df_results).unwrap(), @r" - 
+---------------+------------------------------------------------------------------------------------------------------------------------+ - | plan_type | plan | - +---------------+------------------------------------------------------------------------------------------------------------------------+ - | logical_plan | LeftSemi Join: CAST(t1.a AS Int64) = __correlated_sq_1.count(*) | - | | TableScan: t1 projection=[a, b] | - | | SubqueryAlias: __correlated_sq_1 | - | | Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] | - | | TableScan: t2 projection=[] | - | physical_plan | CoalesceBatchesExec: target_batch_size=8192 | - | | HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(count(*)@0, CAST(t1.a AS Int64)@2)], projection=[a@0, b@1] | - | | ProjectionExec: expr=[4 as count(*)] | - | | PlaceholderRowExec | - | | ProjectionExec: expr=[a@0 as a, b@1 as b, CAST(a@0 AS Int64) as CAST(t1.a AS Int64)] | - | | DataSourceExec: partitions=1, partition_sizes=[1] | - | | | - +---------------+------------------------------------------------------------------------------------------------------------------------+ + +---------------+----------------------------------------------------------------------------------------------------------------------+ + | plan_type | plan | + +---------------+----------------------------------------------------------------------------------------------------------------------+ + | logical_plan | LeftSemi Join: CAST(t1.a AS Int64) = __correlated_sq_1.count(*) | + | | TableScan: t1 projection=[a, b] | + | | SubqueryAlias: __correlated_sq_1 | + | | Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] | + | | TableScan: t2 projection=[] | + | physical_plan | HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(count(*)@0, CAST(t1.a AS Int64)@2)], projection=[a@0, b@1] | + | | ProjectionExec: expr=[4 as count(*)] | + | | PlaceholderRowExec | + | | ProjectionExec: expr=[a@0 as a, b@1 as b, CAST(a@0 AS Int64) as CAST(t1.a AS Int64)] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+----------------------------------------------------------------------------------------------------------------------+ " ); @@ -3077,15 +3219,17 @@ async fn test_count_wildcard_on_window() -> Result<()> { let df_results = ctx .table("t1") .await? - .select(vec![count_all_window() - .order_by(vec![Sort::new(col("a"), false, true)]) - .window_frame(WindowFrame::new_bounds( - WindowFrameUnits::Range, - WindowFrameBound::Preceding(ScalarValue::UInt32(Some(6))), - WindowFrameBound::Following(ScalarValue::UInt32(Some(2))), - )) - .build() - .unwrap()])? + .select(vec![ + count_all_window() + .order_by(vec![Sort::new(col("a"), false, true)]) + .window_frame(WindowFrame::new_bounds( + WindowFrameUnits::Range, + WindowFrameBound::Preceding(ScalarValue::UInt32(Some(6))), + WindowFrameBound::Following(ScalarValue::UInt32(Some(2))), + )) + .build() + .unwrap(), + ])? .explain(false, false)? 
.collect() .await?; @@ -3113,30 +3257,29 @@ async fn test_count_wildcard_on_window() -> Result<()> { #[tokio::test] // Test with `repartition_sorts` disabled, causing a full resort of the data -async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_with_repartition_sorts_false( -) -> Result<()> { +async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_with_repartition_sorts_false() +-> Result<()> { assert_snapshot!( union_with_mix_of_presorted_and_explicitly_resorted_inputs_impl(false).await?, - @r#" + @r" AggregateExec: mode=Final, gby=[id@0 as id], aggr=[], ordering_mode=Sorted - SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] - CoalescePartitionsExec - AggregateExec: mode=Partial, gby=[id@0 as id], aggr=[] - UnionExec - DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet + SortPreservingMergeExec: [id@0 ASC NULLS LAST] + AggregateExec: mode=Partial, gby=[id@0 as id], aggr=[], ordering_mode=Sorted + UnionExec + DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet + SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], file_type=parquet - "#); + "); Ok(()) } -#[ignore] // See https://github.com/apache/datafusion/issues/18380 #[tokio::test] // Test with `repartition_sorts` enabled to preserve pre-sorted partitions and avoid resorting -async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_with_repartition_sorts_true( -) -> Result<()> { +async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_with_repartition_sorts_true() +-> Result<()> { assert_snapshot!( union_with_mix_of_presorted_and_explicitly_resorted_inputs_impl(true).await?, - @r#" + @r" AggregateExec: mode=Final, gby=[id@0 as id], aggr=[], ordering_mode=Sorted SortPreservingMergeExec: [id@0 ASC NULLS LAST] AggregateExec: mode=Partial, gby=[id@0 as id], aggr=[], ordering_mode=Sorted @@ -3144,53 +3287,7 @@ async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_with_reparti DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], file_type=parquet - "#); - - // 💥 Doesn't pass, and generates this plan: - // - // AggregateExec: mode=Final, gby=[id@0 as id], aggr=[], ordering_mode=Sorted - // SortPreservingMergeExec: [id@0 ASC NULLS LAST] - // SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true] - // AggregateExec: mode=Partial, gby=[id@0 as id], aggr=[] - // UnionExec - // DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet - // DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], file_type=parquet - // - // - // === Excerpt from the verbose explain === - // - // 
+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - // | plan_type | plan | - // +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - // | initial_physical_plan | AggregateExec: mode=Final, gby=[id@0 as id], aggr=[], ordering_mode=Sorted | - // | | AggregateExec: mode=Partial, gby=[id@0 as id], aggr=[], ordering_mode=Sorted | - // | | UnionExec | - // | | DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet | - // | | SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] | - // | | DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], file_type=parquet | - // ... - // | physical_plan after EnforceDistribution | OutputRequirementExec: order_by=[], dist_by=Unspecified | - // | | AggregateExec: mode=Final, gby=[id@0 as id], aggr=[], ordering_mode=Sorted | - // | | SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] | - // | | CoalescePartitionsExec | - // | | AggregateExec: mode=Partial, gby=[id@0 as id], aggr=[], ordering_mode=Sorted | - // | | UnionExec | - // | | DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet | - // | | SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] | - // | | DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], file_type=parquet | - // | | | - // | physical_plan after CombinePartialFinalAggregate | SAME TEXT AS ABOVE - // | | | - // | physical_plan after EnforceSorting | OutputRequirementExec: order_by=[], dist_by=Unspecified | - // | | AggregateExec: mode=Final, gby=[id@0 as id], aggr=[], ordering_mode=Sorted | - // | | SortPreservingMergeExec: [id@0 ASC NULLS LAST] | - // | | SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true] | - // | | AggregateExec: mode=Partial, gby=[id@0 as id], aggr=[] | - // | | UnionExec | - // | | DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet | - // | | DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], file_type=parquet | - // ... 
- // +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + "); Ok(()) } @@ -3275,7 +3372,7 @@ async fn test_count_wildcard_on_aggregate() -> Result<()> { assert_snapshot!( pretty_format_batches(&sql_results).unwrap(), - @r###" + @r" +---------------+-----------------------------------------------------+ | plan_type | plan | +---------------+-----------------------------------------------------+ @@ -3286,7 +3383,7 @@ async fn test_count_wildcard_on_aggregate() -> Result<()> { | | PlaceholderRowExec | | | | +---------------+-----------------------------------------------------+ - "### + " ); // add `.select(vec![count_wildcard()])?` to make sure we can analyze all node instead of just top node. @@ -3301,7 +3398,7 @@ async fn test_count_wildcard_on_aggregate() -> Result<()> { assert_snapshot!( pretty_format_batches(&df_results).unwrap(), - @r###" + @r" +---------------+---------------------------------------------------------------+ | plan_type | plan | +---------------+---------------------------------------------------------------+ @@ -3311,7 +3408,7 @@ async fn test_count_wildcard_on_aggregate() -> Result<()> { | | PlaceholderRowExec | | | | +---------------+---------------------------------------------------------------+ - "### + " ); Ok(()) @@ -3331,32 +3428,30 @@ async fn test_count_wildcard_on_where_scalar_subquery() -> Result<()> { assert_snapshot!( pretty_format_batches(&sql_results).unwrap(), @r" - +---------------+---------------------------------------------------------------------------------------------------------------------------+ - | plan_type | plan | - +---------------+---------------------------------------------------------------------------------------------------------------------------+ - | logical_plan | Projection: t1.a, t1.b | - | | Filter: CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) ELSE __scalar_sq_1.count(*) END > Int64(0) | - | | Projection: t1.a, t1.b, __scalar_sq_1.count(*), __scalar_sq_1.__always_true | - | | Left Join: t1.a = __scalar_sq_1.a | - | | TableScan: t1 projection=[a, b] | - | | SubqueryAlias: __scalar_sq_1 | - | | Projection: count(Int64(1)) AS count(*), t2.a, Boolean(true) AS __always_true | - | | Aggregate: groupBy=[[t2.a]], aggr=[[count(Int64(1))]] | - | | TableScan: t2 projection=[a] | - | physical_plan | CoalesceBatchesExec: target_batch_size=8192 | - | | FilterExec: CASE WHEN __always_true@3 IS NULL THEN 0 ELSE count(*)@2 END > 0, projection=[a@0, b@1] | - | | CoalesceBatchesExec: target_batch_size=8192 | - | | HashJoinExec: mode=CollectLeft, join_type=Left, on=[(a@0, a@1)], projection=[a@0, b@1, count(*)@2, __always_true@4] | - | | DataSourceExec: partitions=1, partition_sizes=[1] | - | | ProjectionExec: expr=[count(Int64(1))@1 as count(*), a@0 as a, true as __always_true] | - | | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count(Int64(1))] | - | | CoalesceBatchesExec: target_batch_size=8192 | - | | RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 | - | | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | - | | AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(Int64(1))] | - | | DataSourceExec: partitions=1, partition_sizes=[1] 
| - | | | - +---------------+---------------------------------------------------------------------------------------------------------------------------+ + +---------------+--------------------------------------------------------------------------------------------------------------------------+ + | plan_type | plan | + +---------------+--------------------------------------------------------------------------------------------------------------------------+ + | logical_plan | Projection: t1.a, t1.b | + | | Filter: CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) ELSE __scalar_sq_1.count(*) END > Int64(0) | + | | Projection: t1.a, t1.b, __scalar_sq_1.count(*), __scalar_sq_1.__always_true | + | | Left Join: t1.a = __scalar_sq_1.a | + | | TableScan: t1 projection=[a, b] | + | | SubqueryAlias: __scalar_sq_1 | + | | Projection: count(Int64(1)) AS count(*), t2.a, Boolean(true) AS __always_true | + | | Aggregate: groupBy=[[t2.a]], aggr=[[count(Int64(1))]] | + | | TableScan: t2 projection=[a] | + | physical_plan | FilterExec: CASE WHEN __always_true@3 IS NULL THEN 0 ELSE count(*)@2 END > 0, projection=[a@0, b@1] | + | | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | + | | HashJoinExec: mode=CollectLeft, join_type=Right, on=[(a@1, a@0)], projection=[a@3, b@4, count(*)@0, __always_true@2] | + | | CoalescePartitionsExec | + | | ProjectionExec: expr=[count(Int64(1))@1 as count(*), a@0 as a, true as __always_true] | + | | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count(Int64(1))] | + | | RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1 | + | | AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(Int64(1))] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+--------------------------------------------------------------------------------------------------------------------------+ " ); @@ -3388,32 +3483,30 @@ async fn test_count_wildcard_on_where_scalar_subquery() -> Result<()> { assert_snapshot!( pretty_format_batches(&df_results).unwrap(), @r" - +---------------+---------------------------------------------------------------------------------------------------------------------------+ - | plan_type | plan | - +---------------+---------------------------------------------------------------------------------------------------------------------------+ - | logical_plan | Projection: t1.a, t1.b | - | | Filter: CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) ELSE __scalar_sq_1.count(*) END > Int64(0) | - | | Projection: t1.a, t1.b, __scalar_sq_1.count(*), __scalar_sq_1.__always_true | - | | Left Join: t1.a = __scalar_sq_1.a | - | | TableScan: t1 projection=[a, b] | - | | SubqueryAlias: __scalar_sq_1 | - | | Projection: count(*), t2.a, Boolean(true) AS __always_true | - | | Aggregate: groupBy=[[t2.a]], aggr=[[count(Int64(1)) AS count(*)]] | - | | TableScan: t2 projection=[a] | - | physical_plan | CoalesceBatchesExec: target_batch_size=8192 | - | | FilterExec: CASE WHEN __always_true@3 IS NULL THEN 0 ELSE count(*)@2 END > 0, projection=[a@0, b@1] | - | | CoalesceBatchesExec: target_batch_size=8192 | - | | HashJoinExec: mode=CollectLeft, join_type=Left, on=[(a@0, a@1)], projection=[a@0, b@1, count(*)@2, __always_true@4] | - | | DataSourceExec: partitions=1, partition_sizes=[1] | - | | ProjectionExec: expr=[count(*)@1 as count(*), a@0 as a, true as __always_true] | - | | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], 
aggr=[count(*)] | - | | CoalesceBatchesExec: target_batch_size=8192 | - | | RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 | - | | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | - | | AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(*)] | - | | DataSourceExec: partitions=1, partition_sizes=[1] | - | | | - +---------------+---------------------------------------------------------------------------------------------------------------------------+ + +---------------+--------------------------------------------------------------------------------------------------------------------------+ + | plan_type | plan | + +---------------+--------------------------------------------------------------------------------------------------------------------------+ + | logical_plan | Projection: t1.a, t1.b | + | | Filter: CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) ELSE __scalar_sq_1.count(*) END > Int64(0) | + | | Projection: t1.a, t1.b, __scalar_sq_1.count(*), __scalar_sq_1.__always_true | + | | Left Join: t1.a = __scalar_sq_1.a | + | | TableScan: t1 projection=[a, b] | + | | SubqueryAlias: __scalar_sq_1 | + | | Projection: count(*), t2.a, Boolean(true) AS __always_true | + | | Aggregate: groupBy=[[t2.a]], aggr=[[count(Int64(1)) AS count(*)]] | + | | TableScan: t2 projection=[a] | + | physical_plan | FilterExec: CASE WHEN __always_true@3 IS NULL THEN 0 ELSE count(*)@2 END > 0, projection=[a@0, b@1] | + | | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | + | | HashJoinExec: mode=CollectLeft, join_type=Right, on=[(a@1, a@0)], projection=[a@3, b@4, count(*)@0, __always_true@2] | + | | CoalescePartitionsExec | + | | ProjectionExec: expr=[count(*)@1 as count(*), a@0 as a, true as __always_true] | + | | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count(*)] | + | | RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1 | + | | AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(*)] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+--------------------------------------------------------------------------------------------------------------------------+ " ); @@ -3498,7 +3591,7 @@ async fn sort_on_unprojected_columns() -> Result<()> { assert_snapshot!( batches_to_string(&results), - @r###" + @r" +-----+ | a | +-----+ @@ -3507,7 +3600,7 @@ async fn sort_on_unprojected_columns() -> Result<()> { | 10 | | 1 | +-----+ - "### + " ); Ok(()) @@ -3545,7 +3638,7 @@ async fn sort_on_distinct_columns() -> Result<()> { assert_snapshot!( batches_to_string(&results), - @r###" + @r" +-----+ | a | +-----+ @@ -3553,7 +3646,7 @@ async fn sort_on_distinct_columns() -> Result<()> { | 10 | | 1 | +-----+ - "### + " ); Ok(()) } @@ -3684,14 +3777,14 @@ async fn filter_with_alias_overwrite() -> Result<()> { assert_snapshot!( batches_to_string(&results), - @r###" + @r" +------+ | a | +------+ | true | | true | +------+ - "### + " ); Ok(()) @@ -3720,7 +3813,7 @@ async fn select_with_alias_overwrite() -> Result<()> { assert_snapshot!( batches_to_string(&results), - @r###" + @r" +-------+ | a | +-------+ @@ -3729,7 +3822,7 @@ async fn select_with_alias_overwrite() -> Result<()> { | true | | false | +-------+ - "### + " ); Ok(()) @@ -3755,7 +3848,7 @@ async fn test_grouping_sets() -> Result<()> { assert_snapshot!( batches_to_string(&results), - @r###" + @r" +-----------+-----+---------------+ | a | b | count(test.a) | 
+-----------+-----+---------------+ @@ -3771,7 +3864,7 @@ async fn test_grouping_sets() -> Result<()> { | 123AbcDef | | 1 | | 123AbcDef | 100 | 1 | +-----------+-----+---------------+ - "### + " ); Ok(()) @@ -3798,7 +3891,7 @@ async fn test_grouping_sets_count() -> Result<()> { assert_snapshot!( batches_to_string(&results), - @r###" + @r" +----+----+-----------------+ | c1 | c2 | count(Int32(1)) | +----+----+-----------------+ @@ -3813,7 +3906,7 @@ async fn test_grouping_sets_count() -> Result<()> { | b | | 19 | | a | | 21 | +----+----+-----------------+ - "### + " ); Ok(()) @@ -3847,7 +3940,7 @@ async fn test_grouping_set_array_agg_with_overflow() -> Result<()> { assert_snapshot!( batches_to_string(&results), - @r###" + @r" +----+----+--------+---------------------+ | c1 | c2 | sum_c3 | avg_c3 | +----+----+--------+---------------------+ @@ -3887,7 +3980,7 @@ async fn test_grouping_set_array_agg_with_overflow() -> Result<()> { | a | 2 | -46 | -15.333333333333334 | | a | 1 | -88 | -17.6 | +----+----+--------+---------------------+ - "### + " ); Ok(()) @@ -3924,25 +4017,25 @@ async fn join_with_alias_filter() -> Result<()> { let actual = formatted.trim(); assert_snapshot!( actual, - @r###" + @r" Projection: t1.a, t2.a, t1.b, t1.c, t2.b, t2.c [a:UInt32, a:UInt32, b:Utf8, c:Int32, b:Utf8, c:Int32] Inner Join: t1.a + UInt32(3) = t2.a + UInt32(1) [a:UInt32, b:Utf8, c:Int32, a:UInt32, b:Utf8, c:Int32] TableScan: t1 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32] TableScan: t2 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32] - "### + " ); let results = df.collect().await?; assert_snapshot!( batches_to_sort_string(&results), - @r###" + @r" +----+----+---+----+---+---+ | a | a | b | c | b | c | +----+----+---+----+---+---+ | 1 | 3 | a | 10 | a | 1 | | 11 | 13 | c | 30 | c | 3 | +----+----+---+----+---+---+ - "### + " ); Ok(()) @@ -3969,27 +4062,27 @@ async fn right_semi_with_alias_filter() -> Result<()> { let actual = formatted.trim(); assert_snapshot!( actual, - @r###" + @r" RightSemi Join: t1.a = t2.a [a:UInt32, b:Utf8, c:Int32] Projection: t1.a [a:UInt32] Filter: t1.c > Int32(1) [a:UInt32, c:Int32] TableScan: t1 projection=[a, c] [a:UInt32, c:Int32] Filter: t2.c > Int32(1) [a:UInt32, b:Utf8, c:Int32] TableScan: t2 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32] - "### + " ); let results = df.collect().await?; assert_snapshot!( batches_to_sort_string(&results), - @r###" + @r" +-----+---+---+ | a | b | c | +-----+---+---+ | 10 | b | 2 | | 100 | d | 4 | +-----+---+---+ - "### + " ); Ok(()) @@ -4016,26 +4109,26 @@ async fn right_anti_filter_push_down() -> Result<()> { let actual = formatted.trim(); assert_snapshot!( actual, - @r###" + @r" RightAnti Join: t1.a = t2.a Filter: t2.c > Int32(1) [a:UInt32, b:Utf8, c:Int32] Projection: t1.a [a:UInt32] Filter: t1.c > Int32(1) [a:UInt32, c:Int32] TableScan: t1 projection=[a, c] [a:UInt32, c:Int32] TableScan: t2 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32] - "### + " ); let results = df.collect().await?; assert_snapshot!( batches_to_sort_string(&results), - @r###" + @r" +----+---+---+ | a | b | c | +----+---+---+ | 13 | c | 3 | | 3 | a | 1 | +----+---+---+ - "### + " ); Ok(()) @@ -4048,37 +4141,37 @@ async fn unnest_columns() -> Result<()> { let results = df.collect().await?; assert_snapshot!( batches_to_sort_string(&results), - @r###" - +----------+---------------------------------+--------------------------+ - | shape_id | points | tags | - +----------+---------------------------------+--------------------------+ - | 1 | [{x: 5, y: -8}, {x: -3, y: 
-4}] | [tag1] | - | 2 | [{x: 6, y: 2}, {x: -2, y: -8}] | [tag1] | - | 3 | [{x: -9, y: -7}, {x: -2, y: 5}] | [tag1, tag2, tag3, tag4] | - | 4 | | [tag1, tag2, tag3] | - +----------+---------------------------------+--------------------------+ - "###); + @r" + +----------+---------------------------------+--------------------------+ + | shape_id | points | tags | + +----------+---------------------------------+--------------------------+ + | 1 | [{x: 5, y: -8}, {x: -3, y: -4}] | [tag1] | + | 2 | [{x: 6, y: 2}, {x: -2, y: -8}] | [tag1] | + | 3 | [{x: -9, y: -7}, {x: -2, y: 5}] | [tag1, tag2, tag3, tag4] | + | 4 | | [tag1, tag2, tag3] | + +----------+---------------------------------+--------------------------+ + "); // Unnest tags let df = table_with_nested_types(NUM_ROWS).await?; let results = df.unnest_columns(&["tags"])?.collect().await?; assert_snapshot!( batches_to_sort_string(&results), - @r###" - +----------+---------------------------------+------+ - | shape_id | points | tags | - +----------+---------------------------------+------+ - | 1 | [{x: 5, y: -8}, {x: -3, y: -4}] | tag1 | - | 2 | [{x: 6, y: 2}, {x: -2, y: -8}] | tag1 | - | 3 | [{x: -9, y: -7}, {x: -2, y: 5}] | tag1 | - | 3 | [{x: -9, y: -7}, {x: -2, y: 5}] | tag2 | - | 3 | [{x: -9, y: -7}, {x: -2, y: 5}] | tag3 | - | 3 | [{x: -9, y: -7}, {x: -2, y: 5}] | tag4 | - | 4 | | tag1 | - | 4 | | tag2 | - | 4 | | tag3 | - +----------+---------------------------------+------+ - "###); + @r" + +----------+---------------------------------+------+ + | shape_id | points | tags | + +----------+---------------------------------+------+ + | 1 | [{x: 5, y: -8}, {x: -3, y: -4}] | tag1 | + | 2 | [{x: 6, y: 2}, {x: -2, y: -8}] | tag1 | + | 3 | [{x: -9, y: -7}, {x: -2, y: 5}] | tag1 | + | 3 | [{x: -9, y: -7}, {x: -2, y: 5}] | tag2 | + | 3 | [{x: -9, y: -7}, {x: -2, y: 5}] | tag3 | + | 3 | [{x: -9, y: -7}, {x: -2, y: 5}] | tag4 | + | 4 | | tag1 | + | 4 | | tag2 | + | 4 | | tag3 | + +----------+---------------------------------+------+ + "); // Test aggregate results for tags. let df = table_with_nested_types(NUM_ROWS).await?; @@ -4090,19 +4183,19 @@ async fn unnest_columns() -> Result<()> { let results = df.unnest_columns(&["points"])?.collect().await?; assert_snapshot!( batches_to_sort_string(&results), - @r###" - +----------+----------------+--------------------------+ - | shape_id | points | tags | - +----------+----------------+--------------------------+ - | 1 | {x: -3, y: -4} | [tag1] | - | 1 | {x: 5, y: -8} | [tag1] | - | 2 | {x: -2, y: -8} | [tag1] | - | 2 | {x: 6, y: 2} | [tag1] | - | 3 | {x: -2, y: 5} | [tag1, tag2, tag3, tag4] | - | 3 | {x: -9, y: -7} | [tag1, tag2, tag3, tag4] | - | 4 | | [tag1, tag2, tag3] | - +----------+----------------+--------------------------+ - "###); + @r" + +----------+----------------+--------------------------+ + | shape_id | points | tags | + +----------+----------------+--------------------------+ + | 1 | {x: -3, y: -4} | [tag1] | + | 1 | {x: 5, y: -8} | [tag1] | + | 2 | {x: -2, y: -8} | [tag1] | + | 2 | {x: 6, y: 2} | [tag1] | + | 3 | {x: -2, y: 5} | [tag1, tag2, tag3, tag4] | + | 3 | {x: -9, y: -7} | [tag1, tag2, tag3, tag4] | + | 4 | | [tag1, tag2, tag3] | + +----------+----------------+--------------------------+ + "); // Test aggregate results for points. 
let df = table_with_nested_types(NUM_ROWS).await?; @@ -4118,27 +4211,27 @@ async fn unnest_columns() -> Result<()> { .await?; assert_snapshot!( batches_to_sort_string(&results), - @r###" - +----------+----------------+------+ - | shape_id | points | tags | - +----------+----------------+------+ - | 1 | {x: -3, y: -4} | tag1 | - | 1 | {x: 5, y: -8} | tag1 | - | 2 | {x: -2, y: -8} | tag1 | - | 2 | {x: 6, y: 2} | tag1 | - | 3 | {x: -2, y: 5} | tag1 | - | 3 | {x: -2, y: 5} | tag2 | - | 3 | {x: -2, y: 5} | tag3 | - | 3 | {x: -2, y: 5} | tag4 | - | 3 | {x: -9, y: -7} | tag1 | - | 3 | {x: -9, y: -7} | tag2 | - | 3 | {x: -9, y: -7} | tag3 | - | 3 | {x: -9, y: -7} | tag4 | - | 4 | | tag1 | - | 4 | | tag2 | - | 4 | | tag3 | - +----------+----------------+------+ - "###); + @r" + +----------+----------------+------+ + | shape_id | points | tags | + +----------+----------------+------+ + | 1 | {x: -3, y: -4} | tag1 | + | 1 | {x: 5, y: -8} | tag1 | + | 2 | {x: -2, y: -8} | tag1 | + | 2 | {x: 6, y: 2} | tag1 | + | 3 | {x: -2, y: 5} | tag1 | + | 3 | {x: -2, y: 5} | tag2 | + | 3 | {x: -2, y: 5} | tag3 | + | 3 | {x: -2, y: 5} | tag4 | + | 3 | {x: -9, y: -7} | tag1 | + | 3 | {x: -9, y: -7} | tag2 | + | 3 | {x: -9, y: -7} | tag3 | + | 3 | {x: -9, y: -7} | tag4 | + | 4 | | tag1 | + | 4 | | tag2 | + | 4 | | tag3 | + +----------+----------------+------+ + "); // Test aggregate results for points and tags. let df = table_with_nested_types(NUM_ROWS).await?; @@ -4178,7 +4271,7 @@ async fn unnest_dict_encoded_columns() -> Result<()> { let results = df.collect().await.unwrap(); assert_snapshot!( batches_to_string(&results), - @r###" + @r" +-----------------+---------+ | make_array_expr | column1 | +-----------------+---------+ @@ -4186,7 +4279,7 @@ async fn unnest_dict_encoded_columns() -> Result<()> { | y | y | | z | z | +-----------------+---------+ - "### + " ); // make_array(dict_encoded_string,literal string) @@ -4206,7 +4299,7 @@ async fn unnest_dict_encoded_columns() -> Result<()> { let results = df.collect().await.unwrap(); assert_snapshot!( batches_to_string(&results), - @r###" + @r" +-----------------+---------+ | make_array_expr | column1 | +-----------------+---------+ @@ -4217,7 +4310,7 @@ async fn unnest_dict_encoded_columns() -> Result<()> { | z | z | | fixed_string | z | +-----------------+---------+ - "### + " ); Ok(()) } @@ -4228,7 +4321,7 @@ async fn unnest_column_nulls() -> Result<()> { let results = df.clone().collect().await?; assert_snapshot!( batches_to_string(&results), - @r###" + @r" +--------+----+ | list | id | +--------+----+ @@ -4237,7 +4330,7 @@ async fn unnest_column_nulls() -> Result<()> { | [] | C | | [3] | D | +--------+----+ - "### + " ); // Unnest, preserving nulls (row with B is preserved) @@ -4250,7 +4343,7 @@ async fn unnest_column_nulls() -> Result<()> { .await?; assert_snapshot!( batches_to_string(&results), - @r###" + @r" +------+----+ | list | id | +------+----+ @@ -4259,7 +4352,7 @@ async fn unnest_column_nulls() -> Result<()> { | | B | | 3 | D | +------+----+ - "### + " ); let options = UnnestOptions::new().with_preserve_nulls(false); @@ -4269,7 +4362,7 @@ async fn unnest_column_nulls() -> Result<()> { .await?; assert_snapshot!( batches_to_string(&results), - @r###" + @r" +------+----+ | list | id | +------+----+ @@ -4277,7 +4370,7 @@ async fn unnest_column_nulls() -> Result<()> { | 2 | A | | 3 | D | +------+----+ - "### + " ); Ok(()) @@ -4294,7 +4387,7 @@ async fn unnest_fixed_list() -> Result<()> { let results = df.clone().collect().await?; assert_snapshot!( 
batches_to_sort_string(&results), - @r###" + @r" +----------+----------------+ | shape_id | tags | +----------+----------------+ @@ -4305,7 +4398,7 @@ async fn unnest_fixed_list() -> Result<()> { | 5 | [tag51, tag52] | | 6 | [tag61, tag62] | +----------+----------------+ - "### + " ); let options = UnnestOptions::new().with_preserve_nulls(true); @@ -4316,7 +4409,7 @@ async fn unnest_fixed_list() -> Result<()> { .await?; assert_snapshot!( batches_to_sort_string(&results), - @r###" + @r" +----------+-------+ | shape_id | tags | +----------+-------+ @@ -4331,7 +4424,7 @@ async fn unnest_fixed_list() -> Result<()> { | 6 | tag61 | | 6 | tag62 | +----------+-------+ - "### + " ); Ok(()) @@ -4348,7 +4441,7 @@ async fn unnest_fixed_list_drop_nulls() -> Result<()> { let results = df.clone().collect().await?; assert_snapshot!( batches_to_sort_string(&results), - @r###" + @r" +----------+----------------+ | shape_id | tags | +----------+----------------+ @@ -4359,7 +4452,7 @@ async fn unnest_fixed_list_drop_nulls() -> Result<()> { | 5 | [tag51, tag52] | | 6 | [tag61, tag62] | +----------+----------------+ - "### + " ); let options = UnnestOptions::new().with_preserve_nulls(false); @@ -4370,7 +4463,7 @@ async fn unnest_fixed_list_drop_nulls() -> Result<()> { .await?; assert_snapshot!( batches_to_sort_string(&results), - @r###" + @r" +----------+-------+ | shape_id | tags | +----------+-------+ @@ -4383,7 +4476,7 @@ async fn unnest_fixed_list_drop_nulls() -> Result<()> { | 6 | tag61 | | 6 | tag62 | +----------+-------+ - "### + " ); Ok(()) @@ -4419,7 +4512,7 @@ async fn unnest_fixed_list_non_null() -> Result<()> { let results = df.clone().collect().await?; assert_snapshot!( batches_to_sort_string(&results), - @r###" + @r" +----------+----------------+ | shape_id | tags | +----------+----------------+ @@ -4430,7 +4523,7 @@ async fn unnest_fixed_list_non_null() -> Result<()> { | 5 | [tag51, tag52] | | 6 | [tag61, tag62] | +----------+----------------+ - "### + " ); let options = UnnestOptions::new().with_preserve_nulls(true); @@ -4440,7 +4533,7 @@ async fn unnest_fixed_list_non_null() -> Result<()> { .await?; assert_snapshot!( batches_to_sort_string(&results), - @r###" + @r" +----------+-------+ | shape_id | tags | +----------+-------+ @@ -4457,7 +4550,7 @@ async fn unnest_fixed_list_non_null() -> Result<()> { | 6 | tag61 | | 6 | tag62 | +----------+-------+ - "### + " ); Ok(()) @@ -4471,17 +4564,17 @@ async fn unnest_aggregate_columns() -> Result<()> { let results = df.select_columns(&["tags"])?.collect().await?; assert_snapshot!( batches_to_sort_string(&results), - @r###" - +--------------------------+ - | tags | - +--------------------------+ - | [tag1, tag2, tag3, tag4] | - | [tag1, tag2, tag3] | - | [tag1, tag2] | - | [tag1] | - | [tag1] | - +--------------------------+ - "### + @r" + +--------------------------+ + | tags | + +--------------------------+ + | [tag1, tag2, tag3, tag4] | + | [tag1, tag2, tag3] | + | [tag1, tag2] | + | [tag1] | + | [tag1] | + +--------------------------+ + " ); let df = table_with_nested_types(NUM_ROWS).await?; @@ -4492,13 +4585,13 @@ async fn unnest_aggregate_columns() -> Result<()> { .await?; assert_snapshot!( batches_to_sort_string(&results), - @r###" + @r" +-------------+ | count(tags) | +-------------+ | 11 | +-------------+ - "### + " ); Ok(()) @@ -4571,7 +4664,7 @@ async fn unnest_array_agg() -> Result<()> { assert_snapshot!( batches_to_sort_string(&results), - @r###" + @r" +----------+--------+ | shape_id | tag_id | +----------+--------+ @@ -4585,7 +4678,7 @@ 
async fn unnest_array_agg() -> Result<()> { | 3 | 32 | | 3 | 33 | +----------+--------+ - "### + " ); // Doing an `array_agg` by `shape_id` produces: @@ -4599,7 +4692,7 @@ async fn unnest_array_agg() -> Result<()> { .await?; assert_snapshot!( batches_to_sort_string(&results), - @r###" + @r" +----------+--------------+ | shape_id | tag_id | +----------+--------------+ @@ -4607,7 +4700,7 @@ async fn unnest_array_agg() -> Result<()> { | 2 | [21, 22, 23] | | 3 | [31, 32, 33] | +----------+--------------+ - "### + " ); // Unnesting again should produce the original batch. @@ -4623,7 +4716,7 @@ async fn unnest_array_agg() -> Result<()> { .await?; assert_snapshot!( batches_to_sort_string(&results), - @r###" + @r" +----------+--------+ | shape_id | tag_id | +----------+--------+ @@ -4637,7 +4730,7 @@ async fn unnest_array_agg() -> Result<()> { | 3 | 32 | | 3 | 33 | +----------+--------+ - "### + " ); Ok(()) @@ -4667,7 +4760,7 @@ async fn unnest_with_redundant_columns() -> Result<()> { let results = df.clone().collect().await?; assert_snapshot!( batches_to_sort_string(&results), - @r###" + @r" +----------+--------+ | shape_id | tag_id | +----------+--------+ @@ -4681,7 +4774,7 @@ async fn unnest_with_redundant_columns() -> Result<()> { | 3 | 32 | | 3 | 33 | +----------+--------+ - "### + " ); // Doing an `array_agg` by `shape_id` produces: @@ -4703,7 +4796,7 @@ async fn unnest_with_redundant_columns() -> Result<()> { @r" Projection: shapes.shape_id [shape_id:UInt32] Unnest: lists[shape_id2|depth=1] structs[] [shape_id:UInt32, shape_id2:UInt32;N] - Aggregate: groupBy=[[shapes.shape_id]], aggr=[[array_agg(shapes.shape_id) AS shape_id2]] [shape_id:UInt32, shape_id2:List(Field { data_type: UInt32, nullable: true });N] + Aggregate: groupBy=[[shapes.shape_id]], aggr=[[array_agg(shapes.shape_id) AS shape_id2]] [shape_id:UInt32, shape_id2:List(UInt32);N] TableScan: shapes projection=[shape_id] [shape_id:UInt32] " ); @@ -4711,7 +4804,7 @@ async fn unnest_with_redundant_columns() -> Result<()> { let results = df.collect().await?; assert_snapshot!( batches_to_sort_string(&results), - @r###" + @r" +----------+ | shape_id | +----------+ @@ -4725,7 +4818,7 @@ async fn unnest_with_redundant_columns() -> Result<()> { | 3 | | 3 | +----------+ - "### + " ); Ok(()) @@ -4766,7 +4859,7 @@ async fn unnest_multiple_columns() -> Result<()> { // string: a, b, c, d assert_snapshot!( batches_to_string(&results), - @r###" + @r" +------+------------+------------+--------+ | list | large_list | fixed_list | string | +------+------------+------------+--------+ @@ -4780,7 +4873,7 @@ async fn unnest_multiple_columns() -> Result<()> { | | | 4 | c | | | | | d | +------+------------+------------+--------+ - "### + " ); // Test with `preserve_nulls = false`` @@ -4797,7 +4890,7 @@ async fn unnest_multiple_columns() -> Result<()> { // string: a, b, c, d assert_snapshot!( batches_to_string(&results), - @r###" + @r" +------+------------+------------+--------+ | list | large_list | fixed_list | string | +------+------------+------------+--------+ @@ -4810,7 +4903,7 @@ async fn unnest_multiple_columns() -> Result<()> { | | | 3 | c | | | | 4 | c | +------+------------+------------+--------+ - "### + " ); Ok(()) @@ -4839,7 +4932,7 @@ async fn unnest_non_nullable_list() -> Result<()> { assert_snapshot!( batches_to_string(&results), - @r###" + @r" +----+ | c1 | +----+ @@ -4847,7 +4940,7 @@ async fn unnest_non_nullable_list() -> Result<()> { | 2 | | | +----+ - "### + " ); Ok(()) @@ -4892,7 +4985,7 @@ async fn test_read_batches() -> Result<()> { 
let results = df.collect().await?; assert_snapshot!( batches_to_sort_string(&results), - @r###" + @r" +----+--------+ | id | number | +----+--------+ @@ -4905,7 +4998,7 @@ async fn test_read_batches() -> Result<()> { | 5 | 3.33 | | 5 | 6.66 | +----+--------+ - "### + " ); Ok(()) } @@ -4926,10 +5019,10 @@ async fn test_read_batches_empty() -> Result<()> { let results = df.collect().await?; assert_snapshot!( batches_to_sort_string(&results), - @r###" + @r" ++ ++ - "### + " ); Ok(()) } @@ -4978,14 +5071,14 @@ async fn consecutive_projection_same_schema() -> Result<()> { let results = df.collect().await?; assert_snapshot!( batches_to_sort_string(&results), - @r###" + @r" +----+----+----+ | id | t | t2 | +----+----+----+ | 0 | | | | 1 | 10 | 10 | +----+----+----+ - "### + " ); Ok(()) @@ -5299,13 +5392,13 @@ async fn test_array_agg() -> Result<()> { assert_snapshot!( batches_to_string(&results), - @r###" + @r" +-------------------------------------+ | array_agg(test.a) | +-------------------------------------+ | [abcDEF, abc123, CBAdef, 123AbcDef] | +-------------------------------------+ - "### + " ); Ok(()) @@ -5373,10 +5466,10 @@ async fn test_dataframe_placeholder_missing_param_values() -> Result<()> { // N.B., the test is basically `SELECT 1 as a WHERE a = 3;` which returns no results. assert_snapshot!( batches_to_string(&df.collect().await.unwrap()), - @r###" + @r" ++ ++ - "### + " ); Ok(()) @@ -5425,20 +5518,20 @@ async fn test_dataframe_placeholder_column_parameter() -> Result<()> { assert_snapshot!( actual, @r" - Projection: Int32(3) AS $1 [$1:Null;N] + Projection: Int32(3) AS $1 [$1:Int32] EmptyRelation: rows=1 [] " ); assert_snapshot!( batches_to_string(&df.collect().await.unwrap()), - @r###" + @r" +----+ | $1 | +----+ | 3 | +----+ - "### + " ); Ok(()) @@ -5505,42 +5598,45 @@ async fn test_dataframe_placeholder_like_expression() -> Result<()> { assert_snapshot!( batches_to_string(&df.collect().await.unwrap()), - @r###" + @r" +-----+ | a | +-----+ | foo | +-----+ - "### + " ); Ok(()) } +#[rstest] +#[case(DataType::Utf8)] +#[case(DataType::LargeUtf8)] +#[case(DataType::Utf8View)] #[tokio::test] -async fn write_partitioned_parquet_results() -> Result<()> { - // create partitioned input file and context - let tmp_dir = TempDir::new()?; - - let ctx = SessionContext::new(); - +async fn write_partitioned_parquet_results(#[case] string_type: DataType) -> Result<()> { // Create an in memory table with schema C1 and C2, both strings let schema = Arc::new(Schema::new(vec![ - Field::new("c1", DataType::Utf8, false), - Field::new("c2", DataType::Utf8, false), + Field::new("c1", string_type.clone(), false), + Field::new("c2", string_type.clone(), false), ])); - let record_batch = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(StringArray::from(vec!["abc", "def"])), - Arc::new(StringArray::from(vec!["123", "456"])), - ], - )?; + let columns = [ + Arc::new(StringArray::from(vec!["abc", "def"])) as ArrayRef, + Arc::new(StringArray::from(vec!["123", "456"])) as ArrayRef, + ] + .map(|col| arrow::compute::cast(&col, &string_type).unwrap()) + .to_vec(); + + let record_batch = RecordBatch::try_new(schema.clone(), columns)?; let mem_table = Arc::new(MemTable::try_new(schema, vec![vec![record_batch]])?); // Register the table in the context + // create partitioned input file and context + let tmp_dir = TempDir::new()?; + let ctx = SessionContext::new(); ctx.register_table("test", mem_table)?; let local = Arc::new(LocalFileSystem::new_with_prefix(&tmp_dir)?); @@ -5567,16 +5663,17 @@ async fn 
write_partitioned_parquet_results() -> Result<()> { // Check that the c2 column is gone and that c1 is abc. let results = filter_df.collect().await?; + insta::allow_duplicates! { assert_snapshot!( batches_to_string(&results), - @r###" + @r" +-----+ | c1 | +-----+ | abc | +-----+ - "### - ); + " + )}; // Read the entire set of parquet files let df = ctx @@ -5589,17 +5686,19 @@ async fn write_partitioned_parquet_results() -> Result<()> { // Check that the df has the entire set of data let results = df.collect().await?; - assert_snapshot!( - batches_to_sort_string(&results), - @r###" + insta::allow_duplicates! { + assert_snapshot!( + batches_to_sort_string(&results), + @r" +-----+-----+ | c1 | c2 | +-----+-----+ | abc | 123 | | def | 456 | +-----+-----+ - "### - ); + " + ) + }; Ok(()) } @@ -5755,7 +5854,7 @@ async fn sparse_union_is_null() { // view_all assert_snapshot!( batches_to_sort_string(&df.clone().collect().await.unwrap()), - @r###" + @r" +----------+ | my_union | +----------+ @@ -5766,14 +5865,14 @@ async fn sparse_union_is_null() { | {C=a} | | {C=} | +----------+ - "### + " ); // filter where is null let result_df = df.clone().filter(col("my_union").is_null()).unwrap(); assert_snapshot!( batches_to_sort_string(&result_df.collect().await.unwrap()), - @r###" + @r" +----------+ | my_union | +----------+ @@ -5781,14 +5880,14 @@ async fn sparse_union_is_null() { | {B=} | | {C=} | +----------+ - "### + " ); // filter where is not null let result_df = df.filter(col("my_union").is_not_null()).unwrap(); assert_snapshot!( batches_to_sort_string(&result_df.collect().await.unwrap()), - @r###" + @r" +----------+ | my_union | +----------+ @@ -5796,7 +5895,7 @@ async fn sparse_union_is_null() { | {B=3.2} | | {C=a} | +----------+ - "### + " ); } @@ -5838,7 +5937,7 @@ async fn dense_union_is_null() { // view_all assert_snapshot!( batches_to_sort_string(&df.clone().collect().await.unwrap()), - @r###" + @r" +----------+ | my_union | +----------+ @@ -5849,14 +5948,14 @@ async fn dense_union_is_null() { | {C=a} | | {C=} | +----------+ - "### + " ); // filter where is null let result_df = df.clone().filter(col("my_union").is_null()).unwrap(); assert_snapshot!( batches_to_sort_string(&result_df.collect().await.unwrap()), - @r###" + @r" +----------+ | my_union | +----------+ @@ -5864,14 +5963,14 @@ async fn dense_union_is_null() { | {B=} | | {C=} | +----------+ - "### + " ); // filter where is not null let result_df = df.filter(col("my_union").is_not_null()).unwrap(); assert_snapshot!( batches_to_sort_string(&result_df.collect().await.unwrap()), - @r###" + @r" +----------+ | my_union | +----------+ @@ -5879,7 +5978,7 @@ async fn dense_union_is_null() { | {B=3.2} | | {C=a} | +----------+ - "### + " ); } @@ -5911,7 +6010,7 @@ async fn boolean_dictionary_as_filter() { // view_all assert_snapshot!( batches_to_string(&df.clone().collect().await.unwrap()), - @r###" + @r" +---------+ | my_dict | +---------+ @@ -5923,14 +6022,14 @@ async fn boolean_dictionary_as_filter() { | true | | false | +---------+ - "### + " ); let result_df = df.clone().filter(col("my_dict")).unwrap(); assert_snapshot!( batches_to_string(&result_df.collect().await.unwrap()), - @r###" + @r" +---------+ | my_dict | +---------+ @@ -5938,7 +6037,7 @@ async fn boolean_dictionary_as_filter() { | true | | true | +---------+ - "### + " ); // test nested dictionary @@ -5969,26 +6068,26 @@ async fn boolean_dictionary_as_filter() { // view_all assert_snapshot!( batches_to_string(&df.clone().collect().await.unwrap()), - @r###" + @r" +----------------+ 
| my_nested_dict | +----------------+ | true | | false | +----------------+ - "### + " ); let result_df = df.clone().filter(col("my_nested_dict")).unwrap(); assert_snapshot!( batches_to_string(&result_df.collect().await.unwrap()), - @r###" + @r" +----------------+ | my_nested_dict | +----------------+ | true | +----------------+ - "### + " ); } @@ -6066,11 +6165,11 @@ async fn test_alias() -> Result<()> { .into_unoptimized_plan() .display_indent_schema() .to_string(); - assert_snapshot!(plan, @r###" + assert_snapshot!(plan, @r" SubqueryAlias: table_alias [a:Utf8, b:Int32, one:Int32] Projection: test.a, test.b, Int32(1) AS one [a:Utf8, b:Int32, one:Int32] TableScan: test [a:Utf8, b:Int32] - "###); + "); // Select over the aliased DataFrame let df = df.select(vec![ @@ -6079,7 +6178,7 @@ async fn test_alias() -> Result<()> { ])?; assert_snapshot!( batches_to_sort_string(&df.collect().await.unwrap()), - @r###" + @r" +-----------+---------------------------------+ | a | table_alias.b + table_alias.one | +-----------+---------------------------------+ @@ -6088,7 +6187,7 @@ async fn test_alias() -> Result<()> { | abc123 | 11 | | abcDEF | 2 | +-----------+---------------------------------+ - "### + " ); Ok(()) } @@ -6118,7 +6217,7 @@ async fn test_alias_self_join() -> Result<()> { let joined = left.join(right, JoinType::Full, &["a"], &["a"], None)?; assert_snapshot!( batches_to_sort_string(&joined.collect().await.unwrap()), - @r###" + @r" +-----------+-----+-----------+-----+ | a | b | a | b | +-----------+-----+-----------+-----+ @@ -6127,7 +6226,7 @@ async fn test_alias_self_join() -> Result<()> { | abc123 | 10 | abc123 | 10 | | abcDEF | 1 | abcDEF | 1 | +-----------+-----+-----------+-----+ - "### + " ); Ok(()) } @@ -6140,14 +6239,14 @@ async fn test_alias_empty() -> Result<()> { .into_unoptimized_plan() .display_indent_schema() .to_string(); - assert_snapshot!(plan, @r###" + assert_snapshot!(plan, @r" SubqueryAlias: [a:Utf8, b:Int32] TableScan: test [a:Utf8, b:Int32] - "###); + "); assert_snapshot!( batches_to_sort_string(&df.select(vec![col("a"), col("b")])?.collect().await.unwrap()), - @r###" + @r" +-----------+-----+ | a | b | +-----------+-----+ @@ -6156,7 +6255,7 @@ async fn test_alias_empty() -> Result<()> { | abc123 | 10 | | abcDEF | 1 | +-----------+-----+ - "### + " ); Ok(()) @@ -6175,12 +6274,12 @@ async fn test_alias_nested() -> Result<()> { .into_optimized_plan()? 
.display_indent_schema() .to_string(); - assert_snapshot!(plan, @r###" + assert_snapshot!(plan, @r" SubqueryAlias: alias2 [a:Utf8, b:Int32, one:Int32] SubqueryAlias: alias1 [a:Utf8, b:Int32, one:Int32] Projection: test.a, test.b, Int32(1) AS one [a:Utf8, b:Int32, one:Int32] TableScan: test projection=[a, b] [a:Utf8, b:Int32] - "###); + "); // Select over the aliased DataFrame let select1 = df @@ -6189,7 +6288,7 @@ async fn test_alias_nested() -> Result<()> { assert_snapshot!( batches_to_sort_string(&select1.collect().await.unwrap()), - @r###" + @r" +-----------+-----------------------+ | a | alias2.b + alias2.one | +-----------+-----------------------+ @@ -6198,7 +6297,7 @@ async fn test_alias_nested() -> Result<()> { | abc123 | 11 | | abcDEF | 2 | +-----------+-----------------------+ - "### + " ); // Only the outermost alias is visible @@ -6217,7 +6316,7 @@ async fn register_non_json_file() { .register_json( "data", "tests/data/test_binary.parquet", - NdJsonReadOptions::default(), + JsonReadOptions::default(), ) .await; assert_contains!( @@ -6318,7 +6417,10 @@ async fn test_insert_into_checking() -> Result<()> { .await .unwrap_err(); - assert_contains!(e.to_string(), "Inserting query schema mismatch: Expected table field 'a' with type Int64, but got 'column1' with type Utf8"); + assert_contains!( + e.to_string(), + "Inserting query schema mismatch: Expected table field 'a' with type Int64, but got 'column1' with type Utf8" + ); Ok(()) } @@ -6365,7 +6467,7 @@ async fn test_fill_null() -> Result<()> { let results = df_filled.collect().await?; assert_snapshot!( batches_to_sort_string(&results), - @r###" + @r" +---+---------+ | a | b | +---+---------+ @@ -6373,7 +6475,7 @@ async fn test_fill_null() -> Result<()> { | 1 | x | | 3 | z | +---+---------+ - "### + " ); Ok(()) @@ -6393,7 +6495,7 @@ async fn test_fill_null_all_columns() -> Result<()> { assert_snapshot!( batches_to_sort_string(&results), - @r###" + @r" +---+---------+ | a | b | +---+---------+ @@ -6401,7 +6503,7 @@ async fn test_fill_null_all_columns() -> Result<()> { | 1 | x | | 3 | z | +---+---------+ - "### + " ); // Fill column "a" null values with a value that cannot be cast to Int32. @@ -6410,7 +6512,7 @@ async fn test_fill_null_all_columns() -> Result<()> { let results = df_filled.collect().await?; assert_snapshot!( batches_to_sort_string(&results), - @r###" + @r" +---+---------+ | a | b | +---+---------+ @@ -6418,7 +6520,7 @@ async fn test_fill_null_all_columns() -> Result<()> { | 1 | x | | 3 | z | +---+---------+ - "### + " ); Ok(()) } @@ -6427,7 +6529,7 @@ async fn test_fill_null_all_columns() -> Result<()> { async fn test_insert_into_casting_support() -> Result<()> { // Testing case1: // Inserting query schema mismatch: Expected table field 'a' with type Float16, but got 'a' with type Utf8. - // And the cast is not supported from Utf8 to Float16. + // And the cast is not supported from Binary to Float16. 
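+    // (For context: the x'..' literals below produce Binary-typed values, and
+    // Arrow provides no cast from Binary to Float16, so this case still
+    // exercises the "cast not supported" error path asserted further down.)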
// Create a new schema with one field called "a" of type Float16, and setting nullable to false let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Float16, false)])); @@ -6438,7 +6540,10 @@ async fn test_insert_into_casting_support() -> Result<()> { let initial_table = Arc::new(MemTable::try_new(schema.clone(), vec![vec![]])?); session_ctx.register_table("t", initial_table.clone())?; - let mut write_df = session_ctx.sql("values ('a123'), ('b456')").await.unwrap(); + let mut write_df = session_ctx + .sql("values (x'a123'), (x'b456')") + .await + .unwrap(); write_df = write_df .clone() @@ -6450,7 +6555,10 @@ async fn test_insert_into_casting_support() -> Result<()> { .await .unwrap_err(); - assert_contains!(e.to_string(), "Inserting query schema mismatch: Expected table field 'a' with type Float16, but got 'a' with type Utf8."); + assert_contains!( + e.to_string(), + "Inserting query schema mismatch: Expected table field 'a' with type Float16, but got 'a' with type Binary." + ); // Testing case2: // Inserting query schema mismatch: Expected table field 'a' with type Utf8View, but got 'a' with type Utf8. @@ -6488,14 +6596,14 @@ async fn test_insert_into_casting_support() -> Result<()> { assert_snapshot!( batches_to_string(&res), - @r###" + @r" +------+ | a | +------+ | a123 | | b456 | +------+ - "### + " ); Ok(()) } @@ -6631,13 +6739,13 @@ async fn test_copy_to_preserves_order() -> Result<()> { // Expect that input to the DataSinkExec is sorted correctly assert_snapshot!( physical_plan_format, - @r###" + @r" UnionExec DataSinkExec: sink=CsvSink(file_groups=[]) SortExec: expr=[column1@0 DESC], preserve_partitioning=[false] DataSourceExec: partitions=1, partition_sizes=[1] DataSourceExec: partitions=1, partition_sizes=[1] - "### + " ); Ok(()) } @@ -6743,3 +6851,50 @@ async fn test_duplicate_state_fields_for_dfschema_construct() -> Result<()> { Ok(()) } + +/// Regression test for https://github.com/apache/datafusion/issues/21411 +/// grouping() should work when wrapped in an alias via the DataFrame API. +/// +/// This bug only manifests through the DataFrame API because `.alias()` wraps +/// the `grouping()` call in an `Expr::Alias` node at the aggregate expression +/// level. The SQL planner handles aliasing separately (via projection), so the +/// `ResolveGroupingFunction` analyzer rule never sees an `Expr::Alias` wrapper +/// around the aggregate function in SQL queries — making SQL-based tests +/// insufficient to cover this case. +#[tokio::test] +async fn test_grouping_with_alias() -> Result<()> { + use datafusion_functions_aggregate::expr_fn::grouping; + + let df = create_test_table("test") + .await? + .aggregate(vec![col("a")], vec![grouping(col("a")).alias("g")])? + .sort(vec![Sort::new(col("a"), true, false)])?; + + let results = df.collect().await?; + + let expected = [ + "+-----------+---+", + "| a | g |", + "+-----------+---+", + "| 123AbcDef | 0 |", + "| CBAdef | 0 |", + "| abc123 | 0 |", + "| abcDEF | 0 |", + "+-----------+---+", + ]; + assert_batches_eq!(expected, &results); + + // Also verify that nested aliases (e.g. .alias("x").alias("g")) work correctly + let df = create_test_table("test") + .await? + .aggregate( + vec![col("a")], + vec![grouping(col("a")).alias("x").alias("g")], + )? 
+ .sort(vec![Sort::new(col("a"), true, false)])?; + + let results = df.collect().await?; + assert_batches_eq!(expected, &results); + + Ok(()) +} diff --git a/datafusion/core/tests/datasource/object_store_access.rs b/datafusion/core/tests/datasource/object_store_access.rs index f89ca9e049147..83b84f6f9284e 100644 --- a/datafusion/core/tests/datasource/object_store_access.rs +++ b/datafusion/core/tests/datasource/object_store_access.rs @@ -27,17 +27,21 @@ use arrow::array::{ArrayRef, Int32Array, RecordBatch}; use async_trait::async_trait; use bytes::Bytes; -use datafusion::prelude::{CsvReadOptions, ParquetReadOptions, SessionContext}; +use datafusion::prelude::{ + CsvReadOptions, JsonReadOptions, ParquetReadOptions, SessionContext, +}; use datafusion_catalog_listing::{ListingOptions, ListingTable, ListingTableConfig}; use datafusion_datasource::ListingTableUrl; use datafusion_datasource_csv::CsvFormat; +use datafusion_datasource_json::JsonFormat; use futures::stream::BoxStream; use insta::assert_snapshot; use object_store::memory::InMemory; use object_store::path::Path; use object_store::{ - GetOptions, GetRange, GetResult, ListResult, MultipartUpload, ObjectMeta, - ObjectStore, PutMultipartOptions, PutOptions, PutPayload, PutResult, + CopyOptions, GetOptions, GetRange, GetResult, ListResult, MultipartUpload, + ObjectMeta, ObjectStore, ObjectStoreExt, PutMultipartOptions, PutOptions, PutPayload, + PutResult, }; use parking_lot::Mutex; use std::fmt; @@ -54,8 +58,8 @@ async fn create_single_csv_file() { @r" RequestCountingObjectStore() Total Requests: 2 - - HEAD path=csv_table.csv - - GET path=csv_table.csv + - GET (opts) path=csv_table.csv head=true + - GET (opts) path=csv_table.csv " ); } @@ -76,7 +80,7 @@ async fn query_single_csv_file() { ------- Object Store Request Summary ------- RequestCountingObjectStore() Total Requests: 2 - - HEAD path=csv_table.csv + - GET (opts) path=csv_table.csv head=true - GET (opts) path=csv_table.csv " ); @@ -91,15 +95,15 @@ async fn create_multi_file_csv_file() { RequestCountingObjectStore() Total Requests: 4 - LIST prefix=data - - GET path=data/file_0.csv - - GET path=data/file_1.csv - - GET path=data/file_2.csv + - GET (opts) path=data/file_0.csv + - GET (opts) path=data/file_1.csv + - GET (opts) path=data/file_2.csv " ); } #[tokio::test] -async fn query_multi_csv_file() { +async fn multi_query_multi_file_csv_file() { let test = Test::new().with_multi_file_csv().await; assert_snapshot!( test.query("select * from csv_table").await, @@ -117,6 +121,56 @@ async fn query_multi_csv_file() { +---------+-------+-------+ ------- Object Store Request Summary ------- RequestCountingObjectStore() + Total Requests: 3 + - GET (opts) path=data/file_0.csv + - GET (opts) path=data/file_1.csv + - GET (opts) path=data/file_2.csv + " + ); + + // Force a cache eviction by removing the data limit for the cache + assert_snapshot!( + test.query("set datafusion.runtime.list_files_cache_limit=\"0K\"").await, + @r" + ------- Query Output (0 rows) ------- + ++ + ++ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 0 + " + ); + + // Then re-enable the cache + assert_snapshot!( + test.query("set datafusion.runtime.list_files_cache_limit=\"1M\"").await, + @r" + ------- Query Output (0 rows) ------- + ++ + ++ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 0 + " + ); + + // this query should list the table since the cache entries were evicted + assert_snapshot!( + test.query("select * from 
csv_table").await, + @r" + ------- Query Output (6 rows) ------- + +---------+-------+-------+ + | c1 | c2 | c3 | + +---------+-------+-------+ + | 0.0 | 0.0 | true | + | 0.00003 | 5e-12 | false | + | 0.00001 | 1e-12 | true | + | 0.00003 | 5e-12 | false | + | 0.00002 | 2e-12 | true | + | 0.00003 | 5e-12 | false | + +---------+-------+-------+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() Total Requests: 4 - LIST prefix=data - GET (opts) path=data/file_0.csv @@ -124,6 +178,114 @@ async fn query_multi_csv_file() { - GET (opts) path=data/file_2.csv " ); + + // this query should not list the table since the entries were added in the previous query + assert_snapshot!( + test.query("select * from csv_table").await, + @r" + ------- Query Output (6 rows) ------- + +---------+-------+-------+ + | c1 | c2 | c3 | + +---------+-------+-------+ + | 0.0 | 0.0 | true | + | 0.00003 | 5e-12 | false | + | 0.00001 | 1e-12 | true | + | 0.00003 | 5e-12 | false | + | 0.00002 | 2e-12 | true | + | 0.00003 | 5e-12 | false | + +---------+-------+-------+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 3 + - GET (opts) path=data/file_0.csv + - GET (opts) path=data/file_1.csv + - GET (opts) path=data/file_2.csv + " + ); +} + +#[tokio::test] +async fn query_multi_csv_file() { + let test = Test::new().with_multi_file_csv().await; + assert_snapshot!( + test.query("select * from csv_table").await, + @r" + ------- Query Output (6 rows) ------- + +---------+-------+-------+ + | c1 | c2 | c3 | + +---------+-------+-------+ + | 0.0 | 0.0 | true | + | 0.00003 | 5e-12 | false | + | 0.00001 | 1e-12 | true | + | 0.00003 | 5e-12 | false | + | 0.00002 | 2e-12 | true | + | 0.00003 | 5e-12 | false | + +---------+-------+-------+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 3 + - GET (opts) path=data/file_0.csv + - GET (opts) path=data/file_1.csv + - GET (opts) path=data/file_2.csv + " + ); +} + +/// Test that a CSV file split into byte ranges via repartitioning exercises +/// range-based object store access. +/// +/// With a single file and `target_partitions=3`, the repartitioner produces +/// exactly 3 ranges. For each range, `calculate_range` calls +/// `find_first_newline` via a GET for every non-file boundary it touches +/// (the start boundary if `start > 0`, the end boundary if `end < file_size`), +/// plus one GET for the actual data — so 2 GETs for the first range (end scan +/// + data), 3 for the middle range (start scan + end scan + data), and 2 for +/// the last range (start scan + data) = 7 data GETs total. Additionally, +/// adjacent ranges share a boundary position, so each shared boundary is scanned +/// twice — once as the left range's end and again as the right range's start — +/// producing the duplicate GETs visible in the snapshot. Add the 1 HEAD for +/// file-size metadata = **8 total**. +/// +/// This differs from the JSON reader which uses [`AlignedBoundaryStream`] and +/// needs only 1 GET per range. +/// +/// This test documents the current request pattern to catch regressions. +#[tokio::test] +async fn query_csv_file_with_byte_range_partitions() { + let test = Test::new().with_single_file_csv_for_range_test().await; + // Lower the repartition_file_min_size threshold so the small test file gets + // split, and set target_partitions=3 to produce exactly 3 ranges. 
+ test.query("SET datafusion.optimizer.repartition_file_min_size = 0") + .await; + test.query("SET datafusion.execution.target_partitions = 3") + .await; + assert_snapshot!( + test.query("select * from csv_range_table").await, + @r" + ------- Query Output (6 rows) ------- + +---------+-------+-------+ + | c1 | c2 | c3 | + +---------+-------+-------+ + | 0.00001 | 1e-12 | false | + | 0.00002 | 2e-12 | false | + | 0.00003 | 3e-12 | false | + | 0.00004 | 4e-12 | false | + | 0.00005 | 5e-12 | false | + | 0.00006 | 6e-12 | false | + +---------+-------+-------+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 8 + - GET (opts) path=csv_range_table.csv head=true + - GET (opts) path=csv_range_table.csv range=42-129 + - GET (opts) path=csv_range_table.csv range=0-49 + - GET (opts) path=csv_range_table.csv range=42-129 + - GET (opts) path=csv_range_table.csv range=85-129 + - GET (opts) path=csv_range_table.csv range=49-89 + - GET (opts) path=csv_range_table.csv range=85-129 + - GET (opts) path=csv_range_table.csv range=89-129 + " + ); } #[tokio::test] @@ -145,17 +307,7 @@ async fn query_partitioned_csv_file() { +---------+-------+-------+---+----+-----+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 13 - - LIST (with delimiter) prefix=data - - LIST (with delimiter) prefix=data/a=1 - - LIST (with delimiter) prefix=data/a=2 - - LIST (with delimiter) prefix=data/a=3 - - LIST (with delimiter) prefix=data/a=1/b=10 - - LIST (with delimiter) prefix=data/a=2/b=20 - - LIST (with delimiter) prefix=data/a=3/b=30 - - LIST (with delimiter) prefix=data/a=1/b=10/c=100 - - LIST (with delimiter) prefix=data/a=2/b=20/c=200 - - LIST (with delimiter) prefix=data/a=3/b=30/c=300 + Total Requests: 3 - GET (opts) path=data/a=1/b=10/c=100/file_1.csv - GET (opts) path=data/a=2/b=20/c=200/file_2.csv - GET (opts) path=data/a=3/b=30/c=300/file_3.csv @@ -174,10 +326,7 @@ async fn query_partitioned_csv_file() { +---------+-------+-------+---+----+-----+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 4 - - LIST (with delimiter) prefix=data/a=2 - - LIST (with delimiter) prefix=data/a=2/b=20 - - LIST (with delimiter) prefix=data/a=2/b=20/c=200 + Total Requests: 1 - GET (opts) path=data/a=2/b=20/c=200/file_2.csv " ); @@ -194,17 +343,7 @@ async fn query_partitioned_csv_file() { +---------+-------+-------+---+----+-----+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 11 - - LIST (with delimiter) prefix=data - - LIST (with delimiter) prefix=data/a=1 - - LIST (with delimiter) prefix=data/a=2 - - LIST (with delimiter) prefix=data/a=3 - - LIST (with delimiter) prefix=data/a=1/b=10 - - LIST (with delimiter) prefix=data/a=2/b=20 - - LIST (with delimiter) prefix=data/a=3/b=30 - - LIST (with delimiter) prefix=data/a=1/b=10/c=100 - - LIST (with delimiter) prefix=data/a=2/b=20/c=200 - - LIST (with delimiter) prefix=data/a=3/b=30/c=300 + Total Requests: 1 - GET (opts) path=data/a=2/b=20/c=200/file_2.csv " ); @@ -221,17 +360,7 @@ async fn query_partitioned_csv_file() { +---------+-------+-------+---+----+-----+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 11 - - LIST (with delimiter) prefix=data - - LIST (with delimiter) prefix=data/a=1 - - LIST (with delimiter) prefix=data/a=2 - - LIST (with delimiter) prefix=data/a=3 - - LIST (with delimiter) prefix=data/a=1/b=10 - - LIST (with delimiter) prefix=data/a=2/b=20 - - LIST 
(with delimiter) prefix=data/a=3/b=30 - - LIST (with delimiter) prefix=data/a=1/b=10/c=100 - - LIST (with delimiter) prefix=data/a=2/b=20/c=200 - - LIST (with delimiter) prefix=data/a=3/b=30/c=300 + Total Requests: 1 - GET (opts) path=data/a=2/b=20/c=200/file_2.csv " ); @@ -248,9 +377,7 @@ async fn query_partitioned_csv_file() { +---------+-------+-------+---+----+-----+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 3 - - LIST (with delimiter) prefix=data/a=2/b=20 - - LIST (with delimiter) prefix=data/a=2/b=20/c=200 + Total Requests: 1 - GET (opts) path=data/a=2/b=20/c=200/file_2.csv " ); @@ -267,22 +394,354 @@ async fn query_partitioned_csv_file() { +---------+-------+-------+---+----+-----+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 11 - - LIST (with delimiter) prefix=data - - LIST (with delimiter) prefix=data/a=1 - - LIST (with delimiter) prefix=data/a=2 - - LIST (with delimiter) prefix=data/a=3 - - LIST (with delimiter) prefix=data/a=1/b=10 - - LIST (with delimiter) prefix=data/a=2/b=20 - - LIST (with delimiter) prefix=data/a=3/b=30 - - LIST (with delimiter) prefix=data/a=1/b=10/c=100 - - LIST (with delimiter) prefix=data/a=2/b=20/c=200 - - LIST (with delimiter) prefix=data/a=3/b=30/c=300 + Total Requests: 1 - GET (opts) path=data/a=1/b=10/c=100/file_1.csv " ); } +// ===================================================================== +// JSON (NDJSON) tests — mirrors the CSV tests above +// ===================================================================== + +#[tokio::test] +async fn create_single_json_file() { + let test = Test::new().with_single_file_json().await; + assert_snapshot!( + test.requests(), + @r" + RequestCountingObjectStore() + Total Requests: 2 + - GET (opts) path=json_table.json head=true + - GET (opts) path=json_table.json + " + ); +} + +#[tokio::test] +async fn query_single_json_file() { + let test = Test::new().with_single_file_json().await; + assert_snapshot!( + test.query("select * from json_table").await, + @r" + ------- Query Output (2 rows) ------- + +---------+-------+-------+ + | c1 | c2 | c3 | + +---------+-------+-------+ + | 0.00001 | 5e-12 | true | + | 0.00002 | 4e-12 | false | + +---------+-------+-------+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 2 + - GET (opts) path=json_table.json head=true + - GET (opts) path=json_table.json + " + ); +} + +#[tokio::test] +async fn create_multi_file_json() { + let test = Test::new().with_multi_file_json().await; + assert_snapshot!( + test.requests(), + @r" + RequestCountingObjectStore() + Total Requests: 4 + - LIST prefix=data + - GET (opts) path=data/file_0.json + - GET (opts) path=data/file_1.json + - GET (opts) path=data/file_2.json + " + ); +} + +#[tokio::test] +async fn multi_query_multi_file_json() { + let test = Test::new().with_multi_file_json().await; + assert_snapshot!( + test.query("select * from json_table").await, + @r" + ------- Query Output (6 rows) ------- + +---------+-------+-------+ + | c1 | c2 | c3 | + +---------+-------+-------+ + | 0.0 | 0.0 | true | + | 0.00003 | 5e-12 | false | + | 0.00001 | 1e-12 | true | + | 0.00003 | 5e-12 | false | + | 0.00002 | 2e-12 | true | + | 0.00003 | 5e-12 | false | + +---------+-------+-------+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 3 + - GET (opts) path=data/file_0.json + - GET (opts) path=data/file_1.json + - GET (opts) path=data/file_2.json + " + ); + + 
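The SET statements that follow drive the list-files cache through an evict-and-refill cycle. As a toy model of the observable behavior (illustrative only, not DataFusion's cache implementation; all names here are made up): shrinking the limit to zero drops the cached listings, and raising the limit back does not restore them, so the next query must LIST again.

use std::collections::HashMap;

// Hypothetical model of a size-limited listing cache.
struct ListFilesCache {
    limit_bytes: u64,
    entries: HashMap<String, Vec<String>>, // table prefix -> cached file listing
}

impl ListFilesCache {
    fn set_limit(&mut self, limit_bytes: u64) {
        self.limit_bytes = limit_bytes;
        // A zero limit evicts everything; raising the limit later does not
        // resurrect evicted entries.
        if limit_bytes == 0 {
            self.entries.clear();
        }
    }

    fn lookup(&self, prefix: &str) -> Option<&Vec<String>> {
        self.entries.get(prefix) // a miss forces a LIST against the store
    }
}

fn main() {
    let mut cache = ListFilesCache { limit_bytes: 1 << 20, entries: HashMap::new() };
    cache.entries.insert("data".into(), vec!["data/file_0.json".into()]);
    cache.set_limit(0); // "0K": evict
    cache.set_limit(1 << 20); // "1M": re-enable, still empty
    assert!(cache.lookup("data").is_none()); // the next query re-lists the prefix
}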
// Force a cache eviction by removing the data limit for the cache + assert_snapshot!( + test.query("set datafusion.runtime.list_files_cache_limit=\"0K\"").await, + @r" + ------- Query Output (0 rows) ------- + ++ + ++ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 0 + " + ); + + // Then re-enable the cache + assert_snapshot!( + test.query("set datafusion.runtime.list_files_cache_limit=\"1M\"").await, + @r" + ------- Query Output (0 rows) ------- + ++ + ++ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 0 + " + ); + + // this query should list the table since the cache entries were evicted + assert_snapshot!( + test.query("select * from json_table").await, + @r" + ------- Query Output (6 rows) ------- + +---------+-------+-------+ + | c1 | c2 | c3 | + +---------+-------+-------+ + | 0.0 | 0.0 | true | + | 0.00003 | 5e-12 | false | + | 0.00001 | 1e-12 | true | + | 0.00003 | 5e-12 | false | + | 0.00002 | 2e-12 | true | + | 0.00003 | 5e-12 | false | + +---------+-------+-------+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 4 + - LIST prefix=data + - GET (opts) path=data/file_0.json + - GET (opts) path=data/file_1.json + - GET (opts) path=data/file_2.json + " + ); + + // this query should not list the table since the entries were added in the previous query + assert_snapshot!( + test.query("select * from json_table").await, + @r" + ------- Query Output (6 rows) ------- + +---------+-------+-------+ + | c1 | c2 | c3 | + +---------+-------+-------+ + | 0.0 | 0.0 | true | + | 0.00003 | 5e-12 | false | + | 0.00001 | 1e-12 | true | + | 0.00003 | 5e-12 | false | + | 0.00002 | 2e-12 | true | + | 0.00003 | 5e-12 | false | + +---------+-------+-------+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 3 + - GET (opts) path=data/file_0.json + - GET (opts) path=data/file_1.json + - GET (opts) path=data/file_2.json + " + ); +} + +#[tokio::test] +async fn query_multi_json_file() { + let test = Test::new().with_multi_file_json().await; + assert_snapshot!( + test.query("select * from json_table").await, + @r" + ------- Query Output (6 rows) ------- + +---------+-------+-------+ + | c1 | c2 | c3 | + +---------+-------+-------+ + | 0.0 | 0.0 | true | + | 0.00003 | 5e-12 | false | + | 0.00001 | 1e-12 | true | + | 0.00003 | 5e-12 | false | + | 0.00002 | 2e-12 | true | + | 0.00003 | 5e-12 | false | + +---------+-------+-------+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 3 + - GET (opts) path=data/file_0.json + - GET (opts) path=data/file_1.json + - GET (opts) path=data/file_2.json + " + ); +} + +#[tokio::test] +async fn query_partitioned_json_file() { + let test = Test::new().with_partitioned_json().await; + assert_snapshot!( + test.query("select * from json_table_partitioned").await, + @r" + ------- Query Output (6 rows) ------- + +---------+-------+-------+---+----+-----+ + | d1 | d2 | d3 | a | b | c | + +---------+-------+-------+---+----+-----+ + | 0.00001 | 1e-12 | true | 1 | 10 | 100 | + | 0.00003 | 5e-12 | false | 1 | 10 | 100 | + | 0.00002 | 2e-12 | true | 2 | 20 | 200 | + | 0.00003 | 5e-12 | false | 2 | 20 | 200 | + | 0.00003 | 3e-12 | true | 3 | 30 | 300 | + | 0.00003 | 5e-12 | false | 3 | 30 | 300 | + +---------+-------+-------+---+----+-----+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 3 + - GET (opts) 
path=data/a=1/b=10/c=100/file_1.json + - GET (opts) path=data/a=2/b=20/c=200/file_2.json + - GET (opts) path=data/a=3/b=30/c=300/file_3.json + " + ); + + assert_snapshot!( + test.query("select * from json_table_partitioned WHERE a=2").await, + @r" + ------- Query Output (2 rows) ------- + +---------+-------+-------+---+----+-----+ + | d1 | d2 | d3 | a | b | c | + +---------+-------+-------+---+----+-----+ + | 0.00002 | 2e-12 | true | 2 | 20 | 200 | + | 0.00003 | 5e-12 | false | 2 | 20 | 200 | + +---------+-------+-------+---+----+-----+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 1 + - GET (opts) path=data/a=2/b=20/c=200/file_2.json + " + ); + + assert_snapshot!( + test.query("select * from json_table_partitioned WHERE b=20").await, + @r" + ------- Query Output (2 rows) ------- + +---------+-------+-------+---+----+-----+ + | d1 | d2 | d3 | a | b | c | + +---------+-------+-------+---+----+-----+ + | 0.00002 | 2e-12 | true | 2 | 20 | 200 | + | 0.00003 | 5e-12 | false | 2 | 20 | 200 | + +---------+-------+-------+---+----+-----+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 1 + - GET (opts) path=data/a=2/b=20/c=200/file_2.json + " + ); + + assert_snapshot!( + test.query("select * from json_table_partitioned WHERE c=200").await, + @r" + ------- Query Output (2 rows) ------- + +---------+-------+-------+---+----+-----+ + | d1 | d2 | d3 | a | b | c | + +---------+-------+-------+---+----+-----+ + | 0.00002 | 2e-12 | true | 2 | 20 | 200 | + | 0.00003 | 5e-12 | false | 2 | 20 | 200 | + +---------+-------+-------+---+----+-----+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 1 + - GET (opts) path=data/a=2/b=20/c=200/file_2.json + " + ); + + assert_snapshot!( + test.query("select * from json_table_partitioned WHERE a=2 AND b=20").await, + @r" + ------- Query Output (2 rows) ------- + +---------+-------+-------+---+----+-----+ + | d1 | d2 | d3 | a | b | c | + +---------+-------+-------+---+----+-----+ + | 0.00002 | 2e-12 | true | 2 | 20 | 200 | + | 0.00003 | 5e-12 | false | 2 | 20 | 200 | + +---------+-------+-------+---+----+-----+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 1 + - GET (opts) path=data/a=2/b=20/c=200/file_2.json + " + ); + + assert_snapshot!( + test.query("select * from json_table_partitioned WHERE a<2 AND b=10 AND c=100").await, + @r" + ------- Query Output (2 rows) ------- + +---------+-------+-------+---+----+-----+ + | d1 | d2 | d3 | a | b | c | + +---------+-------+-------+---+----+-----+ + | 0.00001 | 1e-12 | true | 1 | 10 | 100 | + | 0.00003 | 5e-12 | false | 1 | 10 | 100 | + +---------+-------+-------+---+----+-----+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 1 + - GET (opts) path=data/a=1/b=10/c=100/file_1.json + " + ); +} + +/// Test that a JSON file split into byte ranges via repartitioning produces +/// exactly one GET request per byte range — no extra requests for boundary seeking. +/// +/// With a single file and `target_partitions=3`, the repartitioner produces +/// exactly 3 ranges. Each range is served by a single [`AlignedBoundaryStream`] +/// which issues exactly one bounded `get_opts` call, so there are 3 data GETs +/// plus 1 HEAD (to determine file size) = **4 total**. +/// +/// This differs from the CSV reader, which needs multiple GETs per range. 
+/// +/// This test documents the current request pattern to catch regressions. +#[tokio::test] +async fn query_json_file_with_byte_range_partitions() { + let test = Test::new().with_single_file_json_for_range_test().await; + // Lower the repartition_file_min_size threshold so the small test file gets + // split, and set target_partitions=3 to produce exactly 3 ranges. + test.query("SET datafusion.optimizer.repartition_file_min_size = 0") + .await; + test.query("SET datafusion.execution.target_partitions = 3") + .await; + assert_snapshot!( + test.query("select * from json_range_table").await, + @r" + ------- Query Output (6 rows) ------- + +---------+-------+------+ + | c1 | c2 | c3 | + +---------+-------+------+ + | 0.00001 | 1e-12 | true | + | 0.00002 | 2e-12 | true | + | 0.00003 | 3e-12 | true | + | 0.00004 | 4e-12 | true | + | 0.00005 | 5e-12 | true | + | 0.00006 | 6e-12 | true | + +---------+-------+------+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 4 + - GET (opts) path=json_range_table.json head=true + - GET (opts) path=json_range_table.json range=0-216 + - GET (opts) path=json_range_table.json range=71-216 + - GET (opts) path=json_range_table.json range=143-216 + " + ); +} + #[tokio::test] async fn create_single_parquet_file_default() { // The default metadata size hint is 512KB @@ -295,8 +754,8 @@ async fn create_single_parquet_file_default() { @r" RequestCountingObjectStore() Total Requests: 2 - - HEAD path=parquet_table.parquet - - GET (range) range=0-2994 path=parquet_table.parquet + - GET (opts) path=parquet_table.parquet head=true + - GET (ranges) path=parquet_table.parquet ranges=0-2994 " ); } @@ -314,8 +773,8 @@ async fn create_single_parquet_file_prefetch() { @r" RequestCountingObjectStore() Total Requests: 2 - - HEAD path=parquet_table.parquet - - GET (range) range=1994-2994 path=parquet_table.parquet + - GET (opts) path=parquet_table.parquet head=true + - GET (ranges) path=parquet_table.parquet ranges=1994-2994 " ); } @@ -343,10 +802,10 @@ async fn create_single_parquet_file_too_small_prefetch() { @r" RequestCountingObjectStore() Total Requests: 4 - - HEAD path=parquet_table.parquet - - GET (range) range=2494-2994 path=parquet_table.parquet - - GET (range) range=2264-2986 path=parquet_table.parquet - - GET (range) range=2124-2264 path=parquet_table.parquet + - GET (opts) path=parquet_table.parquet head=true + - GET (ranges) path=parquet_table.parquet ranges=2494-2994 + - GET (ranges) path=parquet_table.parquet ranges=2264-2986 + - GET (ranges) path=parquet_table.parquet ranges=2124-2264 " ); } @@ -375,9 +834,9 @@ async fn create_single_parquet_file_small_prefetch() { @r" RequestCountingObjectStore() Total Requests: 3 - - HEAD path=parquet_table.parquet - - GET (range) range=2254-2994 path=parquet_table.parquet - - GET (range) range=2124-2264 path=parquet_table.parquet + - GET (opts) path=parquet_table.parquet head=true + - GET (ranges) path=parquet_table.parquet ranges=2254-2994 + - GET (ranges) path=parquet_table.parquet ranges=2124-2264 " ); } @@ -399,8 +858,8 @@ async fn create_single_parquet_file_no_prefetch() { @r" RequestCountingObjectStore() Total Requests: 2 - - HEAD path=parquet_table.parquet - - GET (range) range=0-2994 path=parquet_table.parquet + - GET (opts) path=parquet_table.parquet head=true + - GET (ranges) path=parquet_table.parquet ranges=0-2994 " ); } @@ -420,7 +879,7 @@ async fn query_single_parquet_file() { ------- Object Store Request Summary ------- RequestCountingObjectStore() Total Requests: 3 
- - HEAD path=parquet_table.parquet + - GET (opts) path=parquet_table.parquet head=true - GET (ranges) path=parquet_table.parquet ranges=4-534,534-1064 - GET (ranges) path=parquet_table.parquet ranges=1064-1594,1594-2124 " @@ -444,7 +903,7 @@ async fn query_single_parquet_file_with_single_predicate() { ------- Object Store Request Summary ------- RequestCountingObjectStore() Total Requests: 2 - - HEAD path=parquet_table.parquet + - GET (opts) path=parquet_table.parquet head=true - GET (ranges) path=parquet_table.parquet ranges=1064-1481,1481-1594,1594-2011,2011-2124 " ); @@ -468,7 +927,7 @@ async fn query_single_parquet_file_multi_row_groups_multiple_predicates() { ------- Object Store Request Summary ------- RequestCountingObjectStore() Total Requests: 3 - - HEAD path=parquet_table.parquet + - GET (opts) path=parquet_table.parquet head=true - GET (ranges) path=parquet_table.parquet ranges=4-421,421-534,534-951,951-1064 - GET (ranges) path=parquet_table.parquet ranges=1064-1481,1481-1594,1594-2011,2011-2124 " @@ -630,6 +1089,116 @@ impl Test { .await } + /// Register a single CSV file with six equal-length rows for byte-range + /// repartitioning tests. With a single file and `target_partitions=3`, the + /// repartitioner creates exactly 3 ranges. + async fn with_single_file_csv_for_range_test(self) -> Test { + let csv_data = "c1,c2,c3\n\ + 0.00001,1e-12,false\n\ + 0.00002,2e-12,false\n\ + 0.00003,3e-12,false\n\ + 0.00004,4e-12,false\n\ + 0.00005,5e-12,false\n\ + 0.00006,6e-12,false\n"; + self.with_bytes("/csv_range_table.csv", csv_data) + .await + .register_csv("csv_range_table", "/csv_range_table.csv") + .await + } + + /// Register a JSON (NDJSON) file at the given path + async fn register_json(self, table_name: &str, path: &str) -> Self { + let url = format!("mem://{path}"); + self.session_context + .register_json(table_name, url, JsonReadOptions::default()) + .await + .unwrap(); + self + } + + /// Register a partitioned JSON table at the given path + async fn register_partitioned_json(self, table_name: &str, path: &str) -> Self { + let file_format = Arc::new(JsonFormat::default()); + let options = ListingOptions::new(file_format); + + let url = format!("mem://{path}").parse().unwrap(); + let table_url = ListingTableUrl::try_new(url, None).unwrap(); + + let session_state = self.session_context.state(); + let mut config = ListingTableConfig::new(table_url).with_listing_options(options); + config = config + .infer_partitions_from_path(&session_state) + .await + .unwrap(); + config = config.infer_schema(&session_state).await.unwrap(); + + let table = Arc::new(ListingTable::try_new(config).unwrap()); + self.session_context + .register_table(table_name, table) + .unwrap(); + self + } + + /// Register a single NDJSON file with three columns and two rows named `json_table` + async fn with_single_file_json(self) -> Test { + let json_data = "{\"c1\":0.00001,\"c2\":5e-12,\"c3\":true}\n\ + {\"c1\":0.00002,\"c2\":4e-12,\"c3\":false}\n"; + self.with_bytes("/json_table.json", json_data) + .await + .register_json("json_table", "/json_table.json") + .await + } + + /// Register a single NDJSON file with six equal-length rows for byte-range + /// repartitioning tests. With a single file and `target_partitions=3`, the + /// repartitioner creates exactly 3 ranges. 
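Each row in this fixture serializes to the same byte length on purpose, so the 3-way byte split lands at stable offsets. A quick standalone check of that sizing (a sketch, not part of the test code):

fn main() {
    let rows: Vec<String> = (1..=6)
        .map(|i| format!(r#"{{"c1":0.0000{i},"c2":{i}e-12,"c3":true}}"#))
        .collect();
    // every row plus its trailing newline is 36 bytes
    assert!(rows.iter().all(|r| r.len() + 1 == 36));
    let total: usize = rows.iter().map(|r| r.len() + 1).sum();
    assert_eq!(total, 216); // matches the 0-216 byte range in the snapshot above
    assert_eq!(total / 3, 72); // nominal per-partition range size
}

The GET offsets 71 and 143 in the snapshot appear to be these nominal 72 and 144 boundaries pulled back to the preceding newline by the reader.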
+ async fn with_single_file_json_for_range_test(self) -> Test { + let json_data = r#"{"c1":0.00001,"c2":1e-12,"c3":true} +{"c1":0.00002,"c2":2e-12,"c3":true} +{"c1":0.00003,"c2":3e-12,"c3":true} +{"c1":0.00004,"c2":4e-12,"c3":true} +{"c1":0.00005,"c2":5e-12,"c3":true} +{"c1":0.00006,"c2":6e-12,"c3":true} +"#; + self.with_bytes("/json_range_table.json", json_data) + .await + .register_json("json_range_table", "/json_range_table.json") + .await + } + + /// Register three NDJSON files in a directory, called `json_table` + async fn with_multi_file_json(mut self) -> Test { + for i in 0..3 { + let json_data = format!( + "{{\"c1\":0.0000{i},\"c2\":{i}e-12,\"c3\":true}}\n\ + {{\"c1\":0.00003,\"c2\":5e-12,\"c3\":false}}\n" + ); + self = self + .with_bytes(&format!("/data/file_{i}.json"), json_data) + .await; + } + self.register_json("json_table", "/data/").await + } + + /// Register three NDJSON files in a partitioned directory structure, called + /// `json_table_partitioned` + async fn with_partitioned_json(mut self) -> Test { + for i in 1..4 { + let json_data = format!( + "{{\"d1\":0.0000{i},\"d2\":{i}e-12,\"d3\":true}}\n\ + {{\"d1\":0.00003,\"d2\":5e-12,\"d3\":false}}\n" + ); + self = self + .with_bytes( + &format!("/data/a={i}/b={}/c={}/file_{i}.json", i * 10, i * 100), + json_data, + ) + .await; + } + self.register_partitioned_json("json_table_partitioned", "/data/") + .await + } + /// Add a single parquet file that has two columns and two row groups named `parquet_table` /// /// Column "a": Int32 with values 0-100] in row group 1 @@ -645,7 +1214,7 @@ impl Test { let mut buffer = vec![]; let props = parquet::file::properties::WriterProperties::builder() - .set_max_row_group_size(100) + .set_max_row_group_row_count(Some(100)) .build(); let mut writer = parquet::arrow::ArrowWriter::try_new( &mut buffer, @@ -696,11 +1265,8 @@ impl Test { /// Details of individual requests made through the [`RequestCountingObjectStore`] #[derive(Clone, Debug)] enum RequestDetails { - Get { path: Path }, GetOpts { path: Path, get_options: GetOptions }, GetRanges { path: Path, ranges: Vec<Range<u64>> }, - GetRange { path: Path, range: Range<u64> }, - Head { path: Path }, List { prefix: Option<Path> }, ListWithDelimiter { prefix: Option<Path> }, ListWithOffset { prefix: Option<Path>, offset: Path }, @@ -718,9 +1284,6 @@ fn display_range(range: &Range<u64>) -> impl Display + '_ { impl Display for RequestDetails { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { - RequestDetails::Get { path } => { - write!(f, "GET path={path}") - } RequestDetails::GetOpts { path, get_options } => { write!(f, "GET (opts) path={path}")?; if let Some(range) = &get_options.range { @@ -758,13 +1321,6 @@ impl Display for RequestDetails { } Ok(()) } - RequestDetails::GetRange { path, range } => { - let range = display_range(range); - write!(f, "GET (range) range={range} path={path}") - } - RequestDetails::Head { path } => { - write!(f, "HEAD path={path}") - } RequestDetails::List { prefix } => { write!(f, "LIST")?; if let Some(prefix) = prefix { @@ -837,7 +1393,7 @@ impl ObjectStore for RequestCountingObjectStore { _payload: PutPayload, _opts: PutOptions, ) -> object_store::Result<PutResult> { - Err(object_store::Error::NotImplemented) + unimplemented!() } async fn put_multipart_opts( @@ -845,15 +1401,7 @@ &self, _location: &Path, _opts: PutMultipartOptions, ) -> object_store::Result<Box<dyn MultipartUpload>> { - Err(object_store::Error::NotImplemented) - } - - async fn get(&self, location: &Path) -> object_store::Result<GetResult> { - let result =
self.inner.get(location).await?; - self.requests.lock().push(RequestDetails::Get { - path: location.to_owned(), - }); - Ok(result) + unimplemented!() } async fn get_opts( @@ -869,19 +1417,6 @@ Ok(result) } - async fn get_range( - &self, - location: &Path, - range: Range<u64>, - ) -> object_store::Result<Bytes> { - let result = self.inner.get_range(location, range.clone()).await?; - self.requests.lock().push(RequestDetails::GetRange { - path: location.to_owned(), - range: range.clone(), - }); - Ok(result) - } - async fn get_ranges( &self, location: &Path, @@ -895,18 +1430,6 @@ Ok(result) } - async fn head(&self, location: &Path) -> object_store::Result<ObjectMeta> { - let result = self.inner.head(location).await?; - self.requests.lock().push(RequestDetails::Head { - path: location.to_owned(), - }); - Ok(result) - } - - async fn delete(&self, _location: &Path) -> object_store::Result<()> { - Err(object_store::Error::NotImplemented) - } - fn list( &self, prefix: Option<&Path>, @@ -942,15 +1465,19 @@ self.inner.list_with_delimiter(prefix).await } - async fn copy(&self, _from: &Path, _to: &Path) -> object_store::Result<()> { - Err(object_store::Error::NotImplemented) + fn delete_stream( + &self, + _locations: BoxStream<'static, object_store::Result<Path>>, + ) -> BoxStream<'static, object_store::Result<Path>> { + unimplemented!() } - async fn copy_if_not_exists( + async fn copy_opts( &self, _from: &Path, _to: &Path, + _options: CopyOptions, ) -> object_store::Result<()> { - Err(object_store::Error::NotImplemented) + unimplemented!() } } diff --git a/datafusion/core/tests/execution/coop.rs b/datafusion/core/tests/execution/coop.rs index b6f406e967509..e02364a0530cc 100644 --- a/datafusion/core/tests/execution/coop.rs +++ b/datafusion/core/tests/execution/coop.rs @@ -22,26 +22,25 @@ use datafusion::common::NullEquality; use datafusion::functions_aggregate::sum; use datafusion::physical_expr::aggregate::AggregateExprBuilder; use datafusion::physical_plan; +use datafusion::physical_plan::ExecutionPlan; use datafusion::physical_plan::aggregates::{ - AggregateExec, AggregateMode, PhysicalGroupBy, + AggregateExec, AggregateMode, LimitOptions, PhysicalGroupBy, }; use datafusion::physical_plan::execution_plan::Boundedness; -use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::SessionContext; -use datafusion_common::{exec_datafusion_err, DataFusionError, JoinType, ScalarValue}; +use datafusion_common::{DataFusionError, JoinType, ScalarValue, exec_datafusion_err}; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_expr_common::operator::Operator; use datafusion_expr_common::operator::Operator::{Divide, Eq, Gt, Modulo}; use datafusion_functions_aggregate::min_max; +use datafusion_physical_expr::Partitioning; use datafusion_physical_expr::expressions::{ - binary, col, lit, BinaryExpr, Column, Literal, + BinaryExpr, Column, Literal, binary, col, lit, }; -use datafusion_physical_expr::Partitioning; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; -use datafusion_physical_optimizer::ensure_coop::EnsureCooperative; use datafusion_physical_optimizer::PhysicalOptimizerRule; -use datafusion_physical_plan::coalesce_batches::CoalesceBatchesExec; +use datafusion_physical_optimizer::ensure_coop::EnsureCooperative; use
datafusion_physical_plan::coop::make_cooperative; use datafusion_physical_plan::filter::FilterExec; use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode, SortMergeJoinExec}; @@ -64,13 +63,14 @@ use std::time::Duration; use tokio::runtime::{Handle, Runtime}; use tokio::select; -#[derive(Debug)] +#[derive(Debug, Clone)] struct RangeBatchGenerator { schema: SchemaRef, value_range: Range<i64>, boundedness: Boundedness, batch_size: usize, poll_count: usize, + original_range: Range<i64>, } impl std::fmt::Display for RangeBatchGenerator { @@ -110,6 +110,13 @@ impl LazyBatchGenerator for RangeBatchGenerator { RecordBatch::try_new(Arc::clone(&self.schema), vec![Arc::new(array)])?; Ok(Some(batch)) } + + fn reset_state(&self) -> Arc<RwLock<dyn LazyBatchGenerator>> { + let mut new = self.clone(); + new.poll_count = 0; + new.value_range = new.original_range.clone(); + Arc::new(RwLock::new(new)) + } } fn make_lazy_exec(column_name: &str, pretend_infinite: bool) -> LazyMemoryExec { @@ -136,16 +143,17 @@ fn make_lazy_exec_with_range( }; // Instantiate the generator with the batch and limit - let gen = RangeBatchGenerator { + let batch_gen = RangeBatchGenerator { schema: Arc::clone(&schema), boundedness, - value_range: range, + value_range: range.clone(), batch_size: 8192, poll_count: 0, + original_range: range, }; // Wrap the generator in a trait object behind Arc<RwLock<dyn LazyBatchGenerator>> - let generator: Arc<RwLock<dyn LazyBatchGenerator>> = Arc::new(RwLock::new(gen)); + let generator: Arc<RwLock<dyn LazyBatchGenerator>> = Arc::new(RwLock::new(batch_gen)); // Create a LazyMemoryExec with one partition using our generator let mut exec = LazyMemoryExec::try_new(schema, vec![generator]).unwrap(); @@ -170,7 +178,7 @@ async fn agg_no_grouping_yields( let inf = Arc::new(make_lazy_exec("value", pretend_infinite)); let aggr = Arc::new(AggregateExec::try_new( AggregateMode::Single, - PhysicalGroupBy::new(vec![], vec![], vec![]), + PhysicalGroupBy::new(vec![], vec![], vec![], false), vec![Arc::new( AggregateExprBuilder::new( sum::sum_udaf(), @@ -204,7 +212,7 @@ async fn agg_grouping_yields( let aggr = Arc::new(AggregateExec::try_new( AggregateMode::Single, - PhysicalGroupBy::new(vec![(group, "group".to_string())], vec![], vec![]), + PhysicalGroupBy::new(vec![(group, "group".to_string())], vec![], vec![], false), vec![Arc::new( AggregateExprBuilder::new(sum::sum_udaf(), vec![value_col.clone()]) .schema(inf.schema()) @@ -225,6 +233,7 @@ async fn agg_grouped_topk_yields( #[values(false, true)] pretend_infinite: bool, ) -> Result<(), Box<dyn Error>> { // build session + let session_ctx = SessionContext::new(); // set up a top-k aggregation @@ -240,6 +249,7 @@ async fn agg_grouped_topk_yields( vec![(group, "group".to_string())], vec![], vec![vec![false]], + false, ), vec![Arc::new( AggregateExprBuilder::new(min_max::max_udaf(), vec![value_col.clone()]) @@ -251,7 +261,7 @@ async fn agg_grouped_topk_yields( inf.clone(), inf.schema(), )?
- .with_limit(Some(100)), + .with_limit_options(Some(LimitOptions::new(100))), ); query_yields(aggr, session_ctx.task_ctx()).await } @@ -415,10 +425,7 @@ async fn filter_reject_all_batches_yields( )); let filtered = Arc::new(FilterExec::try_new(false_predicate, Arc::new(infinite))?); - // Use CoalesceBatchesExec to guarantee each Filter pull always yields an 8192-row batch - let coalesced = Arc::new(CoalesceBatchesExec::new(filtered, 8_192)); - - query_yields(coalesced, session_ctx.task_ctx()).await + query_yields(filtered, session_ctx.task_ctx()).await } #[rstest] @@ -545,6 +552,7 @@ async fn interleave_then_aggregate_yields( vec![], // no GROUP BY columns vec![], // no GROUP BY expressions vec![], // no GROUP BY physical expressions + false, ), vec![Arc::new(aggregate_expr)], vec![None], // no “distinct” flags @@ -573,17 +581,18 @@ async fn join_yields( let left_keys: Vec<Arc<dyn PhysicalExpr>> = vec![Arc::new(Column::new("value", 0))]; let right_keys: Vec<Arc<dyn PhysicalExpr>> = vec![Arc::new(Column::new("value", 0))]; - // Wrap each side in CoalesceBatches + Repartition so they are both hashed into 1 partition - let coalesced_left = - Arc::new(CoalesceBatchesExec::new(Arc::new(infinite_left), 8_192)); - let coalesced_right = - Arc::new(CoalesceBatchesExec::new(Arc::new(infinite_right), 8_192)); - let part_left = Partitioning::Hash(left_keys, 1); let part_right = Partitioning::Hash(right_keys, 1); - let hashed_left = Arc::new(RepartitionExec::try_new(coalesced_left, part_left)?); - let hashed_right = Arc::new(RepartitionExec::try_new(coalesced_right, part_right)?); + // Wrap each side in Repartition so they are both hashed into 1 partition + let hashed_left = Arc::new(RepartitionExec::try_new( + Arc::new(infinite_left), + part_left, + )?); + let hashed_right = Arc::new(RepartitionExec::try_new( + Arc::new(infinite_right), + part_right, + )?); // Build an Inner HashJoinExec → left.value = right.value let join = Arc::new(HashJoinExec::try_new( @@ -598,6 +607,7 @@ async fn join_yields( None, PartitionMode::CollectLeft, NullEquality::NullEqualsNull, + false, )?); query_yields(join, session_ctx.task_ctx()).await @@ -621,17 +631,18 @@ async fn join_agg_yields( let left_keys: Vec<Arc<dyn PhysicalExpr>> = vec![Arc::new(Column::new("value", 0))]; let right_keys: Vec<Arc<dyn PhysicalExpr>> = vec![Arc::new(Column::new("value", 0))]; - // Wrap each side in CoalesceBatches + Repartition so they are both hashed into 1 partition - let coalesced_left = - Arc::new(CoalesceBatchesExec::new(Arc::new(infinite_left), 8_192)); - let coalesced_right = - Arc::new(CoalesceBatchesExec::new(Arc::new(infinite_right), 8_192)); - let part_left = Partitioning::Hash(left_keys, 1); let part_right = Partitioning::Hash(right_keys, 1); - let hashed_left = Arc::new(RepartitionExec::try_new(coalesced_left, part_left)?); - let hashed_right = Arc::new(RepartitionExec::try_new(coalesced_right, part_right)?); + // Wrap each side in Repartition so they are both hashed into 1 partition + let hashed_left = Arc::new(RepartitionExec::try_new( + Arc::new(infinite_left), + part_left, + )?); + let hashed_right = Arc::new(RepartitionExec::try_new( + Arc::new(infinite_right), + part_right, + )?); // Build an Inner HashJoinExec → left.value = right.value let join = Arc::new(HashJoinExec::try_new( @@ -646,6 +657,7 @@ async fn join_agg_yields( None, PartitionMode::CollectLeft, NullEquality::NullEqualsNull, + false, )?); // Project only one column (“value” from the left side) because we just want to sum that let proj_expr = vec![ProjectionExpr::new(
Arc::new(Column::new_with_schema("value", &input_schema)?) as _, - "value".to_string(), + "value", )]; let projection = Arc::new(ProjectionExec::try_new(proj_expr, join)?); @@ -676,7 +688,7 @@ async fn join_agg_yields( let aggr = Arc::new(AggregateExec::try_new( AggregateMode::Single, - PhysicalGroupBy::new(vec![], vec![], vec![]), + PhysicalGroupBy::new(vec![], vec![], vec![], false), vec![Arc::new(aggregate_expr)], vec![None], projection, @@ -711,6 +723,7 @@ async fn hash_join_yields( None, PartitionMode::CollectLeft, NullEquality::NullEqualsNull, + false, )?); query_yields(join, session_ctx.task_ctx()).await @@ -742,9 +755,10 @@ async fn hash_join_without_repartition_and_no_agg( /* filter */ None, &JoinType::Inner, /* output64 */ None, - // Using CollectLeft is fine—just avoid RepartitionExec’s partitioned channels. + // Using CollectLeft is fine—just avoid RepartitionExec's partitioned channels. PartitionMode::CollectLeft, NullEquality::NullEqualsNull, + false, )?); query_yields(join, session_ctx.task_ctx()).await @@ -753,7 +767,7 @@ async fn hash_join_without_repartition_and_no_agg( #[derive(Debug)] enum Yielded { ReadyOrPending, - Err(#[allow(dead_code)] DataFusionError), + Err(#[expect(dead_code)] DataFusionError), Timeout, } @@ -780,9 +794,9 @@ async fn stream_yields( let yielded = select! { result = join_handle => { match result { - Ok(Pending) => Yielded::ReadyOrPending, - Ok(Ready(Ok(_))) => Yielded::ReadyOrPending, - Ok(Ready(Err(e))) => Yielded::Err(e), + Ok(Poll::Pending) => Yielded::ReadyOrPending, + Ok(Poll::Ready(Ok(_))) => Yielded::ReadyOrPending, + Ok(Poll::Ready(Err(e))) => Yielded::Err(e), Err(_) => Yielded::Err(exec_datafusion_err!("join error")), } }, diff --git a/datafusion/core/tests/execution/datasource_split.rs b/datafusion/core/tests/execution/datasource_split.rs index 0b90c6f326168..370249cd8044e 100644 --- a/datafusion/core/tests/execution/datasource_split.rs +++ b/datafusion/core/tests/execution/datasource_split.rs @@ -22,7 +22,7 @@ use arrow::{ }; use datafusion_datasource::memory::MemorySourceConfig; use datafusion_execution::TaskContext; -use datafusion_physical_plan::{common::collect, ExecutionPlan}; +use datafusion_physical_plan::{ExecutionPlan, common::collect}; use std::sync::Arc; /// Helper function to create a memory source with the given batch size and collect all batches diff --git a/datafusion/core/tests/execution/logical_plan.rs b/datafusion/core/tests/execution/logical_plan.rs index ef2e263f2c467..3eaa3fb2ed5e6 100644 --- a/datafusion/core/tests/execution/logical_plan.rs +++ b/datafusion/core/tests/execution/logical_plan.rs @@ -20,7 +20,7 @@ use arrow::array::Int64Array; use arrow::datatypes::{DataType, Field, Schema}; -use datafusion::datasource::{provider_as_source, ViewTable}; +use datafusion::datasource::{ViewTable, provider_as_source}; use datafusion::execution::session_state::SessionStateBuilder; use datafusion_common::{Column, DFSchema, DFSchemaRef, Result, ScalarValue, Spans}; use datafusion_execution::TaskContext; diff --git a/datafusion/core/tests/execution/mod.rs b/datafusion/core/tests/execution/mod.rs index 8770b2a201051..f33ef87aa3023 100644 --- a/datafusion/core/tests/execution/mod.rs +++ b/datafusion/core/tests/execution/mod.rs @@ -18,3 +18,4 @@ mod coop; mod datasource_split; mod logical_plan; +mod register_arrow; diff --git a/datafusion/core/tests/execution/register_arrow.rs b/datafusion/core/tests/execution/register_arrow.rs new file mode 100644 index 0000000000000..4ce16dc0906c1 --- /dev/null +++ 
b/datafusion/core/tests/execution/register_arrow.rs @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Integration tests for register_arrow API + +use datafusion::{execution::options::ArrowReadOptions, prelude::*}; +use datafusion_common::Result; + +#[tokio::test] +async fn test_register_arrow_auto_detects_format() -> Result<()> { + let ctx = SessionContext::new(); + + ctx.register_arrow( + "file_format", + "../../datafusion/datasource-arrow/tests/data/example.arrow", + ArrowReadOptions::default(), + ) + .await?; + + ctx.register_arrow( + "stream_format", + "../../datafusion/datasource-arrow/tests/data/example_stream.arrow", + ArrowReadOptions::default(), + ) + .await?; + + let file_result = ctx.sql("SELECT * FROM file_format ORDER BY f0").await?; + let stream_result = ctx.sql("SELECT * FROM stream_format ORDER BY f0").await?; + + let file_batches = file_result.collect().await?; + let stream_batches = stream_result.collect().await?; + + assert_eq!(file_batches.len(), stream_batches.len()); + assert_eq!(file_batches[0].schema(), stream_batches[0].schema()); + + let file_rows: usize = file_batches.iter().map(|b| b.num_rows()).sum(); + let stream_rows: usize = stream_batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(file_rows, stream_rows); + + Ok(()) +} + +#[tokio::test] +async fn test_register_arrow_join_file_and_stream() -> Result<()> { + let ctx = SessionContext::new(); + + ctx.register_arrow( + "file_table", + "../../datafusion/datasource-arrow/tests/data/example.arrow", + ArrowReadOptions::default(), + ) + .await?; + + ctx.register_arrow( + "stream_table", + "../../datafusion/datasource-arrow/tests/data/example_stream.arrow", + ArrowReadOptions::default(), + ) + .await?; + + let result = ctx + .sql( + "SELECT a.f0, a.f1, b.f0, b.f1 + FROM file_table a + JOIN stream_table b ON a.f0 = b.f0 + WHERE a.f0 <= 2 + ORDER BY a.f0", + ) + .await?; + let batches = result.collect().await?; + + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 2); + + Ok(()) +} diff --git a/datafusion/core/tests/expr_api/mod.rs b/datafusion/core/tests/expr_api/mod.rs index 84e644480a4fd..19ff3933193de 100644 --- a/datafusion/core/tests/expr_api/mod.rs +++ b/datafusion/core/tests/expr_api/mod.rs @@ -16,17 +16,16 @@ // under the License. 
use arrow::array::{ - builder::{ListBuilder, StringBuilder}, ArrayRef, Int64Array, RecordBatch, StringArray, StructArray, + builder::{ListBuilder, StringBuilder}, }; use arrow::datatypes::{DataType, Field}; use arrow::util::pretty::{pretty_format_batches, pretty_format_columns}; use datafusion::prelude::*; use datafusion_common::{DFSchema, ScalarValue}; -use datafusion_expr::execution_props::ExecutionProps; +use datafusion_expr::ExprFunctionExt; use datafusion_expr::expr::NullTreatment; use datafusion_expr::simplify::SimplifyContext; -use datafusion_expr::ExprFunctionExt; use datafusion_functions::core::expr_ext::FieldAccessor; use datafusion_functions_aggregate::first_last::first_value_udaf; use datafusion_functions_aggregate::sum::sum_udaf; @@ -36,6 +35,7 @@ use datafusion_optimizer::simplify_expressions::ExprSimplifier; use std::sync::{Arc, LazyLock}; mod parse_sql_expr; +#[expect(clippy::needless_pass_by_value)] mod simplification; #[test] @@ -342,20 +342,26 @@ fn test_create_physical_expr_nvl2() { #[tokio::test] async fn test_create_physical_expr_coercion() { - // create_physical_expr does apply type coercion and unwrapping in cast + // create_physical_expr applies type coercion (and can unwrap/fold + // literal casts). Comparison coercion prefers numeric types, so + // string/int comparisons cast the string side to the numeric type. // - // expect the cast on the literals - // compare string function to int `id = 1` - create_expr_test(col("id").eq(lit(1i32)), "id@0 = CAST(1 AS Utf8)"); - create_expr_test(lit(1i32).eq(col("id")), "CAST(1 AS Utf8) = id@0"); - // compare int col to string literal `i = '202410'` - // Note this casts the column (not the field) - create_expr_test(col("i").eq(lit("202410")), "CAST(i@1 AS Utf8) = 202410"); - create_expr_test(lit("202410").eq(col("i")), "202410 = CAST(i@1 AS Utf8)"); - // however, when simplified the casts on i should removed - // https://github.com/apache/datafusion/issues/14944 - create_simplified_expr_test(col("i").eq(lit("202410")), "CAST(i@1 AS Utf8) = 202410"); - create_simplified_expr_test(lit("202410").eq(col("i")), "CAST(i@1 AS Utf8) = 202410"); + // string column vs int literal: id (Utf8) is cast to Int32 + create_expr_test(col("id").eq(lit(1i32)), "CAST(id@0 AS Int32) = 1"); + create_expr_test(lit(1i32).eq(col("id")), "1 = CAST(id@0 AS Int32)"); + // int column vs string literal: the string literal is cast to Int64 + create_expr_test(col("i").eq(lit("202410")), "i@1 = CAST(202410 AS Int64)"); + create_expr_test(lit("202410").eq(col("i")), "CAST(202410 AS Int64) = i@1"); + // The simplifier operates on the logical expression before type + // coercion adds the CAST, so the output is unchanged. + create_simplified_expr_test( + col("i").eq(lit("202410")), + "i@1 = CAST(202410 AS Int64)", + ); + create_simplified_expr_test( + lit("202410").eq(col("i")), + "i@1 = CAST(202410 AS Int64)", + ); } /// Evaluates the specified expr as an aggregate and compares the result to the @@ -384,6 +390,7 @@ async fn evaluate_agg_test(expr: Expr, expected_lines: Vec<&str>) { /// Converts the `Expr` to a `PhysicalExpr`, evaluates it against the provided /// `RecordBatch` and compares the result to the expected result. 
+#[expect(clippy::needless_pass_by_value)] fn evaluate_expr_test(expr: Expr, expected_lines: Vec<&str>) { let batch = &TEST_BATCH; let df_schema = DFSchema::try_from(batch.schema()).unwrap(); @@ -420,9 +427,9 @@ fn create_simplified_expr_test(expr: Expr, expected_expr: &str) { let df_schema = DFSchema::try_from(batch.schema()).unwrap(); // Simplify the expression first - let props = ExecutionProps::new(); - let simplify_context = - SimplifyContext::new(&props).with_schema(df_schema.clone().into()); + let simplify_context = SimplifyContext::builder() + .with_schema(Arc::new(df_schema)) + .build(); let simplifier = ExprSimplifier::new(simplify_context).with_max_cycles(10); let simplified = simplifier.simplify(expr).unwrap(); create_expr_test(simplified, expected_expr); diff --git a/datafusion/core/tests/expr_api/parse_sql_expr.rs b/datafusion/core/tests/expr_api/parse_sql_expr.rs index 92c18204324f7..b0d8b3a349ae2 100644 --- a/datafusion/core/tests/expr_api/parse_sql_expr.rs +++ b/datafusion/core/tests/expr_api/parse_sql_expr.rs @@ -19,9 +19,9 @@ use arrow::datatypes::{DataType, Field, Schema}; use datafusion::prelude::{CsvReadOptions, SessionContext}; use datafusion_common::DFSchema; use datafusion_common::{DFSchemaRef, Result, ToDFSchema}; +use datafusion_expr::Expr; use datafusion_expr::col; use datafusion_expr::lit; -use datafusion_expr::Expr; use datafusion_sql::unparser::Unparser; /// A schema like: /// diff --git a/datafusion/core/tests/expr_api/simplification.rs b/datafusion/core/tests/expr_api/simplification.rs index 46c36c6abdacc..245aba66849ce 100644 --- a/datafusion/core/tests/expr_api/simplification.rs +++ b/datafusion/core/tests/expr_api/simplification.rs @@ -23,16 +23,16 @@ use arrow::array::types::IntervalDayTime; use arrow::array::{ArrayRef, Int32Array}; use arrow::datatypes::{DataType, Field, Schema}; use chrono::{DateTime, TimeZone, Utc}; -use datafusion::{error::Result, execution::context::ExecutionProps, prelude::*}; -use datafusion_common::cast::as_int32_array; +use datafusion::{error::Result, prelude::*}; use datafusion_common::ScalarValue; +use datafusion_common::cast::as_int32_array; use datafusion_common::{DFSchemaRef, ToDFSchema}; use datafusion_expr::expr::ScalarFunction; use datafusion_expr::logical_plan::builder::table_scan_with_filters; -use datafusion_expr::simplify::SimplifyInfo; +use datafusion_expr::simplify::SimplifyContext; use datafusion_expr::{ - table_scan, Cast, ColumnarValue, ExprSchemable, LogicalPlan, LogicalPlanBuilder, - ScalarUDF, Volatility, + Cast, ColumnarValue, ExprSchemable, LogicalPlan, LogicalPlanBuilder, Projection, + ScalarUDF, Volatility, table_scan, }; use datafusion_functions::math; use datafusion_optimizer::optimizer::Optimizer; @@ -40,50 +40,6 @@ use datafusion_optimizer::simplify_expressions::{ExprSimplifier, SimplifyExpress use datafusion_optimizer::{OptimizerContext, OptimizerRule}; use std::sync::Arc; -/// In order to simplify expressions, DataFusion must have information -/// about the expressions. 
-/// -/// You can provide that information using DataFusion [DFSchema] -/// objects or from some other implementation -struct MyInfo { - /// The input schema - schema: DFSchemaRef, - - /// Execution specific details needed for constant evaluation such - /// as the current time for `now()` and [VariableProviders] - execution_props: ExecutionProps, -} - -impl SimplifyInfo for MyInfo { - fn is_boolean_type(&self, expr: &Expr) -> Result<bool> { - Ok(matches!( - expr.get_type(self.schema.as_ref())?, - DataType::Boolean - )) - } - - fn nullable(&self, expr: &Expr) -> Result<bool> { - expr.nullable(self.schema.as_ref()) - } - - fn execution_props(&self) -> &ExecutionProps { - &self.execution_props - } - - fn get_data_type(&self, expr: &Expr) -> Result<DataType> { - expr.get_type(self.schema.as_ref()) - } -} - -impl From<DFSchemaRef> for MyInfo { - fn from(schema: DFSchemaRef) -> Self { - Self { - schema, - execution_props: ExecutionProps::new(), - } - } -} - /// A schema like: /// /// a: Int32 (possibly with nulls) @@ -132,14 +88,11 @@ fn test_evaluate_with_start_time( expected_expr: Expr, date_time: &DateTime<Utc>, ) { - let execution_props = - ExecutionProps::new().with_query_execution_start_time(*date_time); - - let info: MyInfo = MyInfo { - schema: schema(), - execution_props, - }; - let simplifier = ExprSimplifier::new(info); + let context = SimplifyContext::builder() + .with_schema(schema()) + .with_query_execution_start_time(Some(*date_time)) + .build(); + let simplifier = ExprSimplifier::new(context); let simplified_expr = simplifier .simplify(input_expr.clone()) .expect("successfully evaluated"); @@ -201,7 +154,10 @@ fn to_timestamp_expr(arg: impl Into<String>) -> Expr { #[test] fn basic() { - let info: MyInfo = schema().into(); + let context = SimplifyContext::builder() + .with_schema(schema()) + .with_query_execution_start_time(Some(Utc::now())) + .build(); // The `Expr` is a core concept in DataFusion, and DataFusion can // help simplify it. @@ -210,21 +166,21 @@ fn basic() { // optimize form `a < 5` automatically let expr = col("a").lt(lit(2i32) + lit(3i32)); - let simplifier = ExprSimplifier::new(info); + let simplifier = ExprSimplifier::new(context); let simplified = simplifier.simplify(expr).unwrap(); assert_eq!(simplified, col("a").lt(lit(5i32))); } #[test] fn fold_and_simplify() { - let info: MyInfo = schema().into(); + let context = SimplifyContext::builder().with_schema(schema()).build(); // What will it do with the expression `concat('foo', 'bar') == 'foobar')`?
let expr = concat(vec![lit("foo"), lit("bar")]).eq(lit("foobar")); // Since datafusion applies both simplification *and* rewriting // some expressions can be entirely simplified - let simplifier = ExprSimplifier::new(info); + let simplifier = ExprSimplifier::new(context); let simplified = simplifier.simplify(expr).unwrap(); assert_eq!(simplified, lit(true)) } @@ -243,10 +199,10 @@ fn to_timestamp_expr_folded() -> Result<()> { let actual = formatted.trim(); assert_snapshot!( actual, - @r###" + @r#" Projection: TimestampNanosecond(1599566400000000000, None) AS to_timestamp(Utf8("2020-09-08T12:00:00+00:00")) TableScan: test - "### + "# ); Ok(()) } @@ -273,10 +229,10 @@ fn now_less_than_timestamp() -> Result<()> { assert_snapshot!( actual, - @r###" + @r" Filter: Boolean(true) TableScan: test - "### + " ); Ok(()) } @@ -312,10 +268,10 @@ fn select_date_plus_interval() -> Result<()> { assert_snapshot!( actual, - @r###" + @r#" Projection: Date32("2021-01-09") AS to_timestamp(Utf8("2020-09-08T12:05:00+00:00")) + IntervalDayTime("IntervalDayTime { days: 123, milliseconds: 0 }") TableScan: test - "### + "# ); Ok(()) } @@ -334,10 +290,10 @@ fn simplify_project_scalar_fn() -> Result<()> { let actual = formatter.trim(); assert_snapshot!( actual, - @r###" + @r" Projection: test.f AS power(test.f,Float64(1)) TableScan: test - "### + " ); Ok(()) } @@ -523,6 +479,72 @@ fn multiple_now() -> Result<()> { Ok(()) } +/// Unwraps an alias expression to get the inner expression +fn unwrap_aliases(expr: &Expr) -> &Expr { + match expr { + Expr::Alias(alias) => unwrap_aliases(&alias.expr), + expr => expr, + } +} + +/// Test that `now()` is simplified to a literal when execution start time is set, +/// but remains as an expression when no execution start time is available. +#[test] +fn now_simplification_with_and_without_start_time() { + let plan = LogicalPlanBuilder::empty(false) + .project(vec![now()]) + .unwrap() + .build() + .unwrap(); + + // Case 1: With execution start time set, now() should be simplified to a literal + { + let time = DateTime::<Utc>::from_timestamp_nanos(123); + let ctx: OptimizerContext = + OptimizerContext::new().with_query_execution_start_time(time); + let optimizer = SimplifyExpressions {}; + let simplified = optimizer + .rewrite(plan.clone(), &ctx) + .expect("rewrite should succeed") + .data; + let LogicalPlan::Projection(Projection { expr, .. }) = simplified else { + panic!("Expected Projection plan"); + }; + assert_eq!(expr.len(), 1); + let simplified = unwrap_aliases(expr.first().unwrap()); + // Should be a literal timestamp + match simplified { + Expr::Literal(ScalarValue::TimestampNanosecond(Some(ts), _), _) => { + assert_eq!(*ts, time.timestamp_nanos_opt().unwrap()); + } + other => panic!("Expected timestamp literal, got: {other:?}"), + } + } + + // Case 2: Without execution start time, now() should remain as a function call + { + let ctx: OptimizerContext = + OptimizerContext::new().without_query_execution_start_time(); + let optimizer = SimplifyExpressions {}; + let simplified = optimizer + .rewrite(plan, &ctx) + .expect("rewrite should succeed") + .data; + let LogicalPlan::Projection(Projection { expr, .. }) = simplified else { + panic!("Expected Projection plan"); + }; + assert_eq!(expr.len(), 1); + let simplified = unwrap_aliases(expr.first().unwrap()); + // Should still be a now() function call + match simplified { + Expr::ScalarFunction(ScalarFunction { func, ..
}) => { + assert_eq!(func.name(), "now"); + } + other => panic!("Expected now() function call, got: {other:?}"), + } + } +} + // ------------------------------ // --- Simplifier tests ----- // ------------------------------ @@ -545,11 +567,10 @@ fn expr_test_schema() -> DFSchemaRef { } fn test_simplify(input_expr: Expr, expected_expr: Expr) { - let info: MyInfo = MyInfo { - schema: expr_test_schema(), - execution_props: ExecutionProps::new(), - }; - let simplifier = ExprSimplifier::new(info); + let context = SimplifyContext::builder() + .with_schema(expr_test_schema()) + .build(); + let simplifier = ExprSimplifier::new(context); let simplified_expr = simplifier .simplify(input_expr.clone()) .expect("successfully evaluated"); @@ -564,11 +585,11 @@ fn test_simplify_with_cycle_count( expected_expr: Expr, expected_count: u32, ) { - let info: MyInfo = MyInfo { - schema: expr_test_schema(), - execution_props: ExecutionProps::new(), - }; - let simplifier = ExprSimplifier::new(info); + let context = SimplifyContext::builder() + .with_schema(expr_test_schema()) + .with_query_execution_start_time(Some(Utc::now())) + .build(); + let simplifier = ExprSimplifier::new(context); let (simplified_expr, count) = simplifier .simplify_with_cycle_count_transformed(input_expr.clone()) .expect("successfully evaluated"); diff --git a/datafusion/core/tests/extension_types/mod.rs b/datafusion/core/tests/extension_types/mod.rs new file mode 100644 index 0000000000000..bfe0c2e34927e --- /dev/null +++ b/datafusion/core/tests/extension_types/mod.rs @@ -0,0 +1,18 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +mod pretty_printing; diff --git a/datafusion/core/tests/extension_types/pretty_printing.rs b/datafusion/core/tests/extension_types/pretty_printing.rs new file mode 100644 index 0000000000000..c0796887b8b6e --- /dev/null +++ b/datafusion/core/tests/extension_types/pretty_printing.rs @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
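For background on the new test file that follows: a canonical Arrow extension type such as Uuid rides on top of a storage type (here FixedSizeBinary(16)) via field metadata, and the registry wired into the session below is what lets DataFusion pick a UUID-aware formatter. A minimal sketch of what `with_extension_type` records (assuming the arrow-schema crate with its canonical extension types enabled):

use arrow_schema::extension::Uuid;
use arrow_schema::{DataType, Field};

fn main() {
    let field = Field::new("my_uuids", DataType::FixedSizeBinary(16), false)
        .with_extension_type(Uuid);
    // The canonical extension type travels as field metadata, i.e.
    // "ARROW:extension:name" -> "arrow.uuid"
    println!("{:?}", field.metadata());
}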
+ +use arrow::array::{FixedSizeBinaryArray, RecordBatch}; +use arrow_schema::extension::Uuid; +use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use datafusion::dataframe::DataFrame; +use datafusion::error::Result; +use datafusion::execution::SessionStateBuilder; +use datafusion::prelude::SessionContext; +use datafusion_expr::registry::MemoryExtensionTypeRegistry; +use insta::assert_snapshot; +use std::sync::Arc; + +fn test_schema() -> SchemaRef { + Arc::new(Schema::new(vec![uuid_field()])) +} + +fn uuid_field() -> Field { + Field::new("my_uuids", DataType::FixedSizeBinary(16), false).with_extension_type(Uuid) +} + +async fn create_test_table() -> Result { + let schema = test_schema(); + + // define data. + let batch = RecordBatch::try_new( + schema, + vec![Arc::new(FixedSizeBinaryArray::from(vec![ + &[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 5, 6], + ]))], + )?; + + let state = SessionStateBuilder::default() + .with_extension_type_registry(Arc::new( + MemoryExtensionTypeRegistry::new_with_canonical_extension_types(), + )) + .build(); + let ctx = SessionContext::new_with_state(state); + + ctx.register_batch("test", batch)?; + + ctx.table("test").await +} + +#[tokio::test] +async fn test_pretty_print_extension_type_formatter() -> Result<()> { + let result = create_test_table().await?.to_string().await?; + + assert_snapshot!( + result, + @r" + +--------------------------------------+ + | my_uuids | + +--------------------------------------+ + | 00000000-0000-0000-0000-000000000000 | + | 00010203-0405-0607-0809-000102030506 | + +--------------------------------------+ + " + ); + + Ok(()) +} diff --git a/datafusion/core/tests/fifo/mod.rs b/datafusion/core/tests/fifo/mod.rs index 141a3f3b75586..3d99cc72fa590 100644 --- a/datafusion/core/tests/fifo/mod.rs +++ b/datafusion/core/tests/fifo/mod.rs @@ -22,21 +22,21 @@ mod unix_test { use std::fs::File; use std::path::PathBuf; - use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; + use std::sync::atomic::{AtomicBool, Ordering}; use std::time::Duration; use arrow::array::Array; use arrow::csv::ReaderBuilder; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; - use datafusion::datasource::stream::{FileStreamProvider, StreamConfig, StreamTable}; use datafusion::datasource::TableProvider; + use datafusion::datasource::stream::{FileStreamProvider, StreamConfig, StreamTable}; use datafusion::{ prelude::{CsvReadOptions, SessionConfig, SessionContext}, test_util::{aggr_test_schema, arrow_test_data}, }; use datafusion_common::instant::Instant; - use datafusion_common::{exec_err, Result}; + use datafusion_common::{Result, exec_err}; use datafusion_expr::SortExpr; use futures::StreamExt; @@ -44,7 +44,7 @@ mod unix_test { use nix::unistd; use tempfile::TempDir; use tokio::io::AsyncWriteExt; - use tokio::task::{spawn_blocking, JoinHandle}; + use tokio::task::{JoinHandle, spawn_blocking}; /// Makes a TableProvider for a fifo file fn fifo_table( @@ -94,7 +94,6 @@ mod unix_test { /// This function creates a writing task for the FIFO file. To verify /// incremental processing, it waits for a signal to continue writing after /// a certain number of lines are written. 
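+ /// The writer runs on a spawned tokio task; the lint expectation for `tokio::spawn` (`expect(clippy::disallowed_methods)`) is now scoped to that spawn call inside the function body rather than applied to the whole function.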
- #[allow(clippy::disallowed_methods)] fn create_writing_task( file_path: PathBuf, header: String, @@ -105,6 +104,7 @@ mod unix_test { // Timeout for a long period of BrokenPipe error let broken_pipe_timeout = Duration::from_secs(10); // Spawn a new task to write to the FIFO file + #[expect(clippy::disallowed_methods)] tokio::spawn(async move { let mut file = tokio::fs::OpenOptions::new() .write(true) @@ -357,7 +357,7 @@ mod unix_test { (sink_fifo_path.clone(), sink_fifo_path.display()); // Spawn a new thread to read sink EXTERNAL TABLE. - #[allow(clippy::disallowed_methods)] // spawn allowed only in tests + #[expect(clippy::disallowed_methods)] // spawn allowed only in tests tasks.push(spawn_blocking(move || { let file = File::open(sink_fifo_path_thread).unwrap(); let schema = Arc::new(Schema::new(vec![ diff --git a/datafusion/core/tests/fuzz.rs b/datafusion/core/tests/fuzz.rs index 92646e8b37636..5e94f12b5805d 100644 --- a/datafusion/core/tests/fuzz.rs +++ b/datafusion/core/tests/fuzz.rs @@ -15,7 +15,10 @@ // specific language governing permissions and limitations // under the License. -/// Run all tests that are found in the `fuzz_cases` directory +/// Run all tests that are found in the `fuzz_cases` directory. +/// Fuzz tests are slow and gated behind the `extended_tests` feature. +/// Run with: cargo test --features extended_tests +#[cfg(feature = "extended_tests")] mod fuzz_cases; #[cfg(test)] diff --git a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs index 4e04da26f70b6..4726e7c4aca5c 100644 --- a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs @@ -24,37 +24,37 @@ use crate::fuzz_cases::aggregation_fuzzer::{ }; use arrow::array::{ - types::Int64Type, Array, ArrayRef, AsArray, Int32Array, Int64Array, RecordBatch, - StringArray, + Array, ArrayRef, AsArray, Int32Array, Int64Array, RecordBatch, StringArray, + types::Int64Type, }; use arrow::compute::concat_batches; use arrow::datatypes::DataType; use arrow::util::pretty::pretty_format_batches; use arrow_schema::{Field, Schema, SchemaRef}; +use datafusion::datasource::MemTable; use datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::source::DataSourceExec; -use datafusion::datasource::MemTable; use datafusion::prelude::{DataFrame, SessionConfig, SessionContext}; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}; use datafusion_common::{HashMap, Result}; use datafusion_common_runtime::JoinSet; use datafusion_functions_aggregate::sum::sum_udaf; -use datafusion_physical_expr::expressions::{col, lit, Column}; use datafusion_physical_expr::PhysicalSortExpr; +use datafusion_physical_expr::expressions::{Column, col, lit}; use datafusion_physical_plan::InputOrderMode; -use test_utils::{add_empty_batches, StringBatchGenerator}; +use test_utils::{StringBatchGenerator, add_empty_batches}; +use datafusion_execution::TaskContext; use datafusion_execution::memory_pool::FairSpillPool; use datafusion_execution::runtime_env::RuntimeEnvBuilder; -use datafusion_execution::TaskContext; use datafusion_physical_expr::aggregate::AggregateExprBuilder; use datafusion_physical_plan::aggregates::{ AggregateExec, AggregateMode, PhysicalGroupBy, }; use datafusion_physical_plan::metrics::MetricValue; -use datafusion_physical_plan::{collect, displayable, ExecutionPlan}; +use datafusion_physical_plan::{ExecutionPlan, collect, displayable}; use rand::rngs::StdRng; -use rand::{random, rng, Rng, 
SeedableRng}; +use rand::{Rng, SeedableRng, random, rng}; // ======================================================================== // The new aggregation fuzz tests based on [`AggregationFuzzer`] @@ -326,15 +326,14 @@ async fn run_aggregate_test(input1: Vec<RecordBatch>, group_by_columns: Vec<&str .unwrap(), ); - let aggregate_expr = - vec![ - AggregateExprBuilder::new(sum_udaf(), vec![col("d", &schema).unwrap()]) - .schema(Arc::clone(&schema)) - .alias("sum1") - .build() - .map(Arc::new) - .unwrap(), - ]; + let aggregate_expr = vec![ + AggregateExprBuilder::new(sum_udaf(), vec![col("d", &schema).unwrap()]) + .schema(Arc::clone(&schema)) + .alias("sum1") + .build() + .map(Arc::new) + .unwrap(), + ]; let expr = group_by_columns .iter() .map(|elem| (col(elem, &schema).unwrap(), (*elem).to_string())) @@ -548,14 +547,14 @@ async fn verify_ordered_aggregate(frame: &DataFrame, expected_sort: bool) { type Node = Arc<dyn ExecutionPlan>; fn f_down(&mut self, node: &'n Self::Node) -> Result<TreeNodeRecursion> { - if let Some(exec) = node.as_any().downcast_ref::<AggregateExec>() { + if let Some(exec) = node.downcast_ref::<AggregateExec>() { if self.expected_sort { assert!(matches!( exec.input_order_mode(), InputOrderMode::PartiallySorted(_) | InputOrderMode::Sorted )); } else { - assert!(matches!(exec.input_order_mode(), InputOrderMode::Linear)); + assert_eq!(*exec.input_order_mode(), InputOrderMode::Linear); } } Ok(TreeNodeRecursion::Continue) } @@ -650,7 +649,9 @@ pub(crate) fn assert_spill_count_metric( if expect_spill && spill_count == 0 { panic!("Expected spill but SpillCount metric not found or SpillCount was 0."); } else if !expect_spill && spill_count > 0 { - panic!("Expected no spill but found SpillCount metric with value greater than 0."); + panic!( + "Expected no spill but found SpillCount metric with value greater than 0." + ); } spill_count diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs index fa8ea0b31c023..fe31098622c58 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs @@ -25,7 +25,7 @@ use datafusion_catalog::TableProvider; use datafusion_common::ScalarValue; use datafusion_common::{error::Result, utils::get_available_parallelism}; use datafusion_expr::col; -use rand::{rng, Rng}; +use rand::{Rng, rng}; use crate::fuzz_cases::aggregation_fuzzer::data_generator::Dataset; @@ -214,7 +214,7 @@ impl GeneratedSessionContextBuilder { /// The generated params for [`SessionContext`] #[derive(Debug)] -#[allow(dead_code)] +#[expect(dead_code)] pub struct SessionContextParams { batch_size: usize, target_partitions: usize, diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs index aaf2d1b9bad4f..e49cffa89b04e 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs @@ -18,7 +18,7 @@ use arrow::array::RecordBatch; use arrow::datatypes::DataType; use datafusion_common::Result; -use datafusion_physical_expr::{expressions::col, PhysicalSortExpr}; +use datafusion_physical_expr::{PhysicalSortExpr, expressions::col}; use datafusion_physical_expr_common::sort_expr::LexOrdering; use datafusion_physical_plan::sorts::sort::sort_batch; use test_utils::stagger_batch; @@ -209,8 +209,8 @@ mod test { sort_keys_set: vec![vec!["b".to_string()]], }; - let mut gen =
DatasetGenerator::new(config); - let datasets = gen.generate().unwrap(); + let mut data_gen = DatasetGenerator::new(config); + let datasets = data_gen.generate().unwrap(); // Should Generate 2 datasets assert_eq!(datasets.len(), 2); diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs index 1a8ef278cc299..430762b1c28db 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs @@ -19,9 +19,9 @@ use std::sync::Arc; use arrow::array::RecordBatch; use arrow::util::pretty::pretty_format_batches; -use datafusion_common::{internal_datafusion_err, Result}; +use datafusion_common::{Result, internal_datafusion_err}; use datafusion_common_runtime::JoinSet; -use rand::{rng, Rng}; +use rand::{Rng, rng}; use crate::fuzz_cases::aggregation_fuzzer::query_builder::QueryBuilder; use crate::fuzz_cases::aggregation_fuzzer::{ diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs index 766e2bedd74c2..7bb6177c31010 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs @@ -17,7 +17,7 @@ use std::{collections::HashSet, str::FromStr}; -use rand::{rng, seq::SliceRandom, Rng}; +use rand::{Rng, rng, seq::SliceRandom}; /// Random aggregate query builder /// @@ -182,13 +182,13 @@ impl QueryBuilder { /// Add max columns num in group by(default: 3), for example if it is set to 1, /// the generated sql will group by at most 1 column - #[allow(dead_code)] + #[expect(dead_code)] pub fn with_max_group_by_columns(mut self, max_group_by_columns: usize) -> Self { self.max_group_by_columns = max_group_by_columns; self } - #[allow(dead_code)] + #[expect(dead_code)] pub fn with_min_group_by_columns(mut self, min_group_by_columns: usize) -> Self { self.min_group_by_columns = min_group_by_columns; self @@ -202,7 +202,7 @@ impl QueryBuilder { } /// Add if also test the no grouping aggregation case(default: true) - #[allow(dead_code)] + #[expect(dead_code)] pub fn with_no_grouping(mut self, no_grouping: bool) -> Self { self.no_grouping = no_grouping; self diff --git a/datafusion/core/tests/fuzz_cases/distinct_count_string_fuzz.rs b/datafusion/core/tests/fuzz_cases/distinct_count_string_fuzz.rs index 3049631d4b3fe..92adda200d1a5 100644 --- a/datafusion/core/tests/fuzz_cases/distinct_count_string_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/distinct_count_string_fuzz.rs @@ -19,7 +19,7 @@ use std::sync::Arc; -use arrow::array::{cast::AsArray, Array, OffsetSizeTrait, RecordBatch}; +use arrow::array::{Array, OffsetSizeTrait, RecordBatch, cast::AsArray}; use datafusion::datasource::MemTable; use datafusion_common_runtime::JoinSet; diff --git a/datafusion/core/tests/fuzz_cases/equivalence/ordering.rs b/datafusion/core/tests/fuzz_cases/equivalence/ordering.rs index 171839b390ffa..a57095066ee12 100644 --- a/datafusion/core/tests/fuzz_cases/equivalence/ordering.rs +++ b/datafusion/core/tests/fuzz_cases/equivalence/ordering.rs @@ -16,19 +16,19 @@ // under the License. 
use crate::fuzz_cases::equivalence::utils::{ - create_random_schema, create_test_params, create_test_schema_2, + TestScalarUDF, create_random_schema, create_test_params, create_test_schema_2, generate_table_for_eq_properties, generate_table_for_orderings, - is_table_same_after_sort, TestScalarUDF, + is_table_same_after_sort, }; use arrow::compute::SortOptions; -use datafusion_common::config::ConfigOptions; use datafusion_common::Result; +use datafusion_common::config::ConfigOptions; use datafusion_expr::{Operator, ScalarUDF}; +use datafusion_physical_expr::ScalarFunctionExpr; use datafusion_physical_expr::equivalence::{ convert_to_orderings, convert_to_sort_exprs, }; -use datafusion_physical_expr::expressions::{col, BinaryExpr}; -use datafusion_physical_expr::ScalarFunctionExpr; +use datafusion_physical_expr::expressions::{BinaryExpr, col}; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; use itertools::Itertools; diff --git a/datafusion/core/tests/fuzz_cases/equivalence/projection.rs b/datafusion/core/tests/fuzz_cases/equivalence/projection.rs index a72a1558b2e41..2f67e211ce915 100644 --- a/datafusion/core/tests/fuzz_cases/equivalence/projection.rs +++ b/datafusion/core/tests/fuzz_cases/equivalence/projection.rs @@ -16,15 +16,15 @@ // under the License. use crate::fuzz_cases::equivalence::utils::{ - apply_projection, create_random_schema, generate_table_for_eq_properties, - is_table_same_after_sort, TestScalarUDF, + TestScalarUDF, apply_projection, create_random_schema, + generate_table_for_eq_properties, is_table_same_after_sort, }; use arrow::compute::SortOptions; -use datafusion_common::config::ConfigOptions; use datafusion_common::Result; +use datafusion_common::config::ConfigOptions; use datafusion_expr::{Operator, ScalarUDF}; use datafusion_physical_expr::equivalence::ProjectionMapping; -use datafusion_physical_expr::expressions::{col, BinaryExpr}; +use datafusion_physical_expr::expressions::{BinaryExpr, col}; use datafusion_physical_expr::{PhysicalExprRef, ScalarFunctionExpr}; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; diff --git a/datafusion/core/tests/fuzz_cases/equivalence/properties.rs b/datafusion/core/tests/fuzz_cases/equivalence/properties.rs index 382c4da943219..1490eb08a0291 100644 --- a/datafusion/core/tests/fuzz_cases/equivalence/properties.rs +++ b/datafusion/core/tests/fuzz_cases/equivalence/properties.rs @@ -18,13 +18,13 @@ use std::sync::Arc; use crate::fuzz_cases::equivalence::utils::{ - create_random_schema, generate_table_for_eq_properties, is_table_same_after_sort, - TestScalarUDF, + TestScalarUDF, create_random_schema, generate_table_for_eq_properties, + is_table_same_after_sort, }; use datafusion_common::Result; use datafusion_expr::{Operator, ScalarUDF}; -use datafusion_physical_expr::expressions::{col, BinaryExpr}; +use datafusion_physical_expr::expressions::{BinaryExpr, col}; use datafusion_physical_expr::{LexOrdering, ScalarFunctionExpr}; use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr; diff --git a/datafusion/core/tests/fuzz_cases/equivalence/utils.rs b/datafusion/core/tests/fuzz_cases/equivalence/utils.rs index be35ddca8f02d..8350cafb215cb 100644 --- a/datafusion/core/tests/fuzz_cases/equivalence/utils.rs +++ b/datafusion/core/tests/fuzz_cases/equivalence/utils.rs @@ -15,26 +15,25 @@ // specific language governing permissions and 
limitations // under the License. -use std::any::Any; use std::cmp::Ordering; use std::sync::Arc; use arrow::array::{ArrayRef, Float32Array, Float64Array, RecordBatch, UInt32Array}; -use arrow::compute::{lexsort_to_indices, take_record_batch, SortColumn, SortOptions}; +use arrow::compute::{SortColumn, SortOptions, lexsort_to_indices, take_record_batch}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion_common::utils::{compare_rows, get_row_at_idx}; -use datafusion_common::{exec_err, internal_datafusion_err, plan_err, Result}; +use datafusion_common::{Result, exec_err, internal_datafusion_err, plan_err}; use datafusion_expr::sort_properties::{ExprProperties, SortProperties}; use datafusion_expr::{ ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, }; use datafusion_physical_expr::equivalence::{ - convert_to_orderings, EquivalenceClass, ProjectionMapping, + EquivalenceClass, ProjectionMapping, convert_to_orderings, }; use datafusion_physical_expr::{ConstExpr, EquivalenceProperties}; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; -use datafusion_physical_plan::expressions::{col, Column}; +use datafusion_physical_plan::expressions::{Column, col}; use itertools::izip; use rand::prelude::*; @@ -50,7 +49,7 @@ pub fn output_schema( let data_type = source.data_type(input_schema)?; let nullable = source.nullable(input_schema)?; for (target, _) in targets.iter() { - let Some(column) = target.as_any().downcast_ref::<Column>() else { + let Some(column) = target.downcast_ref::<Column>() else { return plan_err!("Expects to have column"); }; fields.push(Field::new(column.name(), data_type.clone(), nullable)); } @@ -283,7 +282,7 @@ fn get_representative_arr( schema: SchemaRef, ) -> Option<ArrayRef> { for expr in eq_group.iter() { - let col = expr.as_any().downcast_ref::<Column>().unwrap(); + let col = expr.downcast_ref::<Column>().unwrap(); let (idx, _field) = schema.column_with_name(col.name()).unwrap(); if let Some(res) = &existing_vec[idx] { return Some(Arc::clone(res)); } @@ -371,7 +370,7 @@ pub fn generate_table_for_eq_properties( // Fill constant columns for constant in eq_properties.constants() { - let col = constant.expr.as_any().downcast_ref::<Column>().unwrap(); + let col = constant.expr.downcast_ref::<Column>().unwrap(); let (idx, _field) = schema.column_with_name(col.name()).unwrap(); let arr = Arc::new(Float64Array::from_iter_values(vec![0 as f64; n_elem])) as ArrayRef; @@ -383,7 +382,7 @@ pub fn generate_table_for_eq_properties( let (sort_columns, indices): (Vec<_>, Vec<_>) = ordering .iter() .map(|PhysicalSortExpr { expr, options }| { - let col = expr.as_any().downcast_ref::<Column>().unwrap(); + let col = expr.downcast_ref::<Column>().unwrap(); let (idx, _field) = schema.column_with_name(col.name()).unwrap(); let arr = generate_random_array(n_elem, n_distinct); ( @@ -409,7 +408,7 @@ pub fn generate_table_for_eq_properties( .unwrap_or_else(|| generate_random_array(n_elem, n_distinct)); for expr in eq_group.iter() { - let col = expr.as_any().downcast_ref::<Column>().unwrap(); + let col = expr.downcast_ref::<Column>().unwrap(); let (idx, _field) = schema.column_with_name(col.name()).unwrap(); schema_vec[idx] = Some(Arc::clone(&representative_array)); } @@ -531,9 +530,6 @@ impl TestScalarUDF { } impl ScalarUDFImpl for TestScalarUDF { - fn as_any(&self) -> &dyn Any { - self - } fn name(&self) -> &str { "test-scalar-udf" } diff --git a/datafusion/core/tests/fuzz_cases/join_fuzz.rs b/datafusion/core/tests/fuzz_cases/join_fuzz.rs index
e8ff1ccf06704..fdb2934817bc5 100644 --- a/datafusion/core/tests/fuzz_cases/join_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/join_fuzz.rs @@ -38,8 +38,11 @@ use datafusion::physical_plan::joins::{ }; use datafusion::prelude::{SessionConfig, SessionContext}; use datafusion_common::{NullEquality, ScalarValue}; -use datafusion_physical_expr::expressions::Literal; +use datafusion_execution::TaskContext; +use datafusion_execution::disk_manager::{DiskManagerBuilder, DiskManagerMode}; +use datafusion_execution::runtime_env::RuntimeEnvBuilder; use datafusion_physical_expr::PhysicalExprRef; +use datafusion_physical_expr::expressions::Literal; use itertools::Itertools; use rand::Rng; @@ -91,484 +94,564 @@ fn col_lt_col_filter(schema1: Arc, schema2: Arc) -> JoinFilter { #[tokio::test] async fn test_inner_join_1k_filtered() { - JoinFuzzTestCase::new( - make_staggered_batches_i32(1000), - make_staggered_batches_i32(1000), - JoinType::Inner, - Some(Box::new(col_lt_col_filter)), - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_i32(1000, left_extra), + make_staggered_batches_i32(1000, right_extra), + JoinType::Inner, + Some(Box::new(col_lt_col_filter)), + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_inner_join_1k() { - JoinFuzzTestCase::new( - make_staggered_batches_i32(1000), - make_staggered_batches_i32(1000), - JoinType::Inner, - None, - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_i32(1000, left_extra), + make_staggered_batches_i32(1000, right_extra), + JoinType::Inner, + None, + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_left_join_1k() { - JoinFuzzTestCase::new( - make_staggered_batches_i32(1000), - make_staggered_batches_i32(1000), - JoinType::Left, - None, - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_i32(1000, left_extra), + make_staggered_batches_i32(1000, right_extra), + JoinType::Left, + None, + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_left_join_1k_filtered() { - JoinFuzzTestCase::new( - make_staggered_batches_i32(1000), - make_staggered_batches_i32(1000), - JoinType::Left, - Some(Box::new(col_lt_col_filter)), - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_i32(1000, left_extra), + make_staggered_batches_i32(1000, right_extra), + JoinType::Left, + Some(Box::new(col_lt_col_filter)), + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_right_join_1k() { - JoinFuzzTestCase::new( - make_staggered_batches_i32(1000), - make_staggered_batches_i32(1000), - JoinType::Right, - None, - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_i32(1000, left_extra), + make_staggered_batches_i32(1000, right_extra), + JoinType::Right, + None, + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_right_join_1k_filtered() { - JoinFuzzTestCase::new( - make_staggered_batches_i32(1000), - 
make_staggered_batches_i32(1000), - JoinType::Right, - Some(Box::new(col_lt_col_filter)), - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_i32(1000, left_extra), + make_staggered_batches_i32(1000, right_extra), + JoinType::Right, + Some(Box::new(col_lt_col_filter)), + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_full_join_1k() { - JoinFuzzTestCase::new( - make_staggered_batches_i32(1000), - make_staggered_batches_i32(1000), - JoinType::Full, - None, - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_i32(1000, left_extra), + make_staggered_batches_i32(1000, right_extra), + JoinType::Full, + None, + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_full_join_1k_filtered() { - JoinFuzzTestCase::new( - make_staggered_batches_i32(1000), - make_staggered_batches_i32(1000), - JoinType::Full, - Some(Box::new(col_lt_col_filter)), - ) - .run_test(&[NljHj, HjSmj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_i32(1000, left_extra), + make_staggered_batches_i32(1000, right_extra), + JoinType::Full, + Some(Box::new(col_lt_col_filter)), + ) + .run_test(&[NljHj, HjSmj], false) + .await + } } #[tokio::test] async fn test_left_semi_join_1k() { - JoinFuzzTestCase::new( - make_staggered_batches_i32(1000), - make_staggered_batches_i32(1000), - JoinType::LeftSemi, - None, - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_i32(1000, left_extra), + make_staggered_batches_i32(1000, right_extra), + JoinType::LeftSemi, + None, + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_left_semi_join_1k_filtered() { - JoinFuzzTestCase::new( - make_staggered_batches_i32(1000), - make_staggered_batches_i32(1000), - JoinType::LeftSemi, - Some(Box::new(col_lt_col_filter)), - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_i32(1000, left_extra), + make_staggered_batches_i32(1000, right_extra), + JoinType::LeftSemi, + Some(Box::new(col_lt_col_filter)), + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_right_semi_join_1k() { - JoinFuzzTestCase::new( - make_staggered_batches_i32(1000), - make_staggered_batches_i32(1000), - JoinType::RightSemi, - None, - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_i32(1000, left_extra), + make_staggered_batches_i32(1000, right_extra), + JoinType::RightSemi, + None, + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_right_semi_join_1k_filtered() { - JoinFuzzTestCase::new( - make_staggered_batches_i32(1000), - make_staggered_batches_i32(1000), - JoinType::RightSemi, - Some(Box::new(col_lt_col_filter)), - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_i32(1000, 
left_extra), + make_staggered_batches_i32(1000, right_extra), + JoinType::RightSemi, + Some(Box::new(col_lt_col_filter)), + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_left_anti_join_1k() { - JoinFuzzTestCase::new( - make_staggered_batches_i32(1000), - make_staggered_batches_i32(1000), - JoinType::LeftAnti, - None, - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_i32(1000, left_extra), + make_staggered_batches_i32(1000, right_extra), + JoinType::LeftAnti, + None, + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_left_anti_join_1k_filtered() { - JoinFuzzTestCase::new( - make_staggered_batches_i32(1000), - make_staggered_batches_i32(1000), - JoinType::LeftAnti, - Some(Box::new(col_lt_col_filter)), - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_i32(1000, left_extra), + make_staggered_batches_i32(1000, right_extra), + JoinType::LeftAnti, + Some(Box::new(col_lt_col_filter)), + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_right_anti_join_1k() { - JoinFuzzTestCase::new( - make_staggered_batches_i32(1000), - make_staggered_batches_i32(1000), - JoinType::RightAnti, - None, - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_i32(1000, left_extra), + make_staggered_batches_i32(1000, right_extra), + JoinType::RightAnti, + None, + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_right_anti_join_1k_filtered() { - JoinFuzzTestCase::new( - make_staggered_batches_i32(1000), - make_staggered_batches_i32(1000), - JoinType::RightAnti, - Some(Box::new(col_lt_col_filter)), - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_i32(1000, left_extra), + make_staggered_batches_i32(1000, right_extra), + JoinType::RightAnti, + Some(Box::new(col_lt_col_filter)), + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_left_mark_join_1k() { - JoinFuzzTestCase::new( - make_staggered_batches_i32(1000), - make_staggered_batches_i32(1000), - JoinType::LeftMark, - None, - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_i32(1000, left_extra), + make_staggered_batches_i32(1000, right_extra), + JoinType::LeftMark, + None, + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_left_mark_join_1k_filtered() { - JoinFuzzTestCase::new( - make_staggered_batches_i32(1000), - make_staggered_batches_i32(1000), - JoinType::LeftMark, - Some(Box::new(col_lt_col_filter)), - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_i32(1000, left_extra), + make_staggered_batches_i32(1000, right_extra), + JoinType::LeftMark, + Some(Box::new(col_lt_col_filter)), + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } // todo: add JoinTestType::HjSmj after Right mark SortMergeJoin support 
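+// Note: these join-fuzz cases iterate over (left_extra, right_extra) schema combinations, so the two join inputs do not always share the extra 'y' column; `make_staggered_batches_i32` / `make_staggered_batches_binary` append 'y' only when requested.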
#[tokio::test] async fn test_right_mark_join_1k() { - JoinFuzzTestCase::new( - make_staggered_batches_i32(1000), - make_staggered_batches_i32(1000), - JoinType::RightMark, - None, - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_i32(1000, left_extra), + make_staggered_batches_i32(1000, right_extra), + JoinType::RightMark, + None, + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_right_mark_join_1k_filtered() { - JoinFuzzTestCase::new( - make_staggered_batches_i32(1000), - make_staggered_batches_i32(1000), - JoinType::RightMark, - Some(Box::new(col_lt_col_filter)), - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_i32(1000, left_extra), + make_staggered_batches_i32(1000, right_extra), + JoinType::RightMark, + Some(Box::new(col_lt_col_filter)), + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_inner_join_1k_binary_filtered() { - JoinFuzzTestCase::new( - make_staggered_batches_binary(1000), - make_staggered_batches_binary(1000), - JoinType::Inner, - Some(Box::new(col_lt_col_filter)), - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_binary(1000, left_extra), + make_staggered_batches_binary(1000, right_extra), + JoinType::Inner, + Some(Box::new(col_lt_col_filter)), + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_inner_join_1k_binary() { - JoinFuzzTestCase::new( - make_staggered_batches_binary(1000), - make_staggered_batches_binary(1000), - JoinType::Inner, - None, - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_binary(1000, left_extra), + make_staggered_batches_binary(1000, right_extra), + JoinType::Inner, + None, + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_left_join_1k_binary() { - JoinFuzzTestCase::new( - make_staggered_batches_binary(1000), - make_staggered_batches_binary(1000), - JoinType::Left, - None, - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_binary(1000, left_extra), + make_staggered_batches_binary(1000, right_extra), + JoinType::Left, + None, + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_left_join_1k_binary_filtered() { - JoinFuzzTestCase::new( - make_staggered_batches_binary(1000), - make_staggered_batches_binary(1000), - JoinType::Left, - Some(Box::new(col_lt_col_filter)), - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_binary(1000, left_extra), + make_staggered_batches_binary(1000, right_extra), + JoinType::Left, + Some(Box::new(col_lt_col_filter)), + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_right_join_1k_binary() { - JoinFuzzTestCase::new( - make_staggered_batches_binary(1000), - make_staggered_batches_binary(1000), - JoinType::Right, - None, - ) - .run_test(&[HjSmj, NljHj], 
false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_binary(1000, left_extra), + make_staggered_batches_binary(1000, right_extra), + JoinType::Right, + None, + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_right_join_1k_binary_filtered() { - JoinFuzzTestCase::new( - make_staggered_batches_binary(1000), - make_staggered_batches_binary(1000), - JoinType::Right, - Some(Box::new(col_lt_col_filter)), - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_binary(1000, left_extra), + make_staggered_batches_binary(1000, right_extra), + JoinType::Right, + Some(Box::new(col_lt_col_filter)), + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_full_join_1k_binary() { - JoinFuzzTestCase::new( - make_staggered_batches_binary(1000), - make_staggered_batches_binary(1000), - JoinType::Full, - None, - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_binary(1000, left_extra), + make_staggered_batches_binary(1000, right_extra), + JoinType::Full, + None, + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_full_join_1k_binary_filtered() { - JoinFuzzTestCase::new( - make_staggered_batches_binary(1000), - make_staggered_batches_binary(1000), - JoinType::Full, - Some(Box::new(col_lt_col_filter)), - ) - .run_test(&[NljHj, HjSmj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_binary(1000, left_extra), + make_staggered_batches_binary(1000, right_extra), + JoinType::Full, + Some(Box::new(col_lt_col_filter)), + ) + .run_test(&[NljHj, HjSmj], false) + .await + } } #[tokio::test] async fn test_left_semi_join_1k_binary() { - JoinFuzzTestCase::new( - make_staggered_batches_binary(1000), - make_staggered_batches_binary(1000), - JoinType::LeftSemi, - None, - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_binary(1000, left_extra), + make_staggered_batches_binary(1000, right_extra), + JoinType::LeftSemi, + None, + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_left_semi_join_1k_binary_filtered() { - JoinFuzzTestCase::new( - make_staggered_batches_binary(1000), - make_staggered_batches_binary(1000), - JoinType::LeftSemi, - Some(Box::new(col_lt_col_filter)), - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_binary(1000, left_extra), + make_staggered_batches_binary(1000, right_extra), + JoinType::LeftSemi, + Some(Box::new(col_lt_col_filter)), + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_right_semi_join_1k_binary() { - JoinFuzzTestCase::new( - make_staggered_batches_binary(1000), - make_staggered_batches_binary(1000), - JoinType::RightSemi, - None, - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_binary(1000, left_extra), + 
make_staggered_batches_binary(1000, right_extra), + JoinType::RightSemi, + None, + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_right_semi_join_1k_binary_filtered() { - JoinFuzzTestCase::new( - make_staggered_batches_binary(1000), - make_staggered_batches_binary(1000), - JoinType::RightSemi, - Some(Box::new(col_lt_col_filter)), - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_binary(1000, left_extra), + make_staggered_batches_binary(1000, right_extra), + JoinType::RightSemi, + Some(Box::new(col_lt_col_filter)), + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_left_anti_join_1k_binary() { - JoinFuzzTestCase::new( - make_staggered_batches_binary(1000), - make_staggered_batches_binary(1000), - JoinType::LeftAnti, - None, - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_binary(1000, left_extra), + make_staggered_batches_binary(1000, right_extra), + JoinType::LeftAnti, + None, + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_left_anti_join_1k_binary_filtered() { - JoinFuzzTestCase::new( - make_staggered_batches_binary(1000), - make_staggered_batches_binary(1000), - JoinType::LeftAnti, - Some(Box::new(col_lt_col_filter)), - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_binary(1000, left_extra), + make_staggered_batches_binary(1000, right_extra), + JoinType::LeftAnti, + Some(Box::new(col_lt_col_filter)), + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_right_anti_join_1k_binary() { - JoinFuzzTestCase::new( - make_staggered_batches_binary(1000), - make_staggered_batches_binary(1000), - JoinType::RightAnti, - None, - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_binary(1000, left_extra), + make_staggered_batches_binary(1000, right_extra), + JoinType::RightAnti, + None, + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_right_anti_join_1k_binary_filtered() { - JoinFuzzTestCase::new( - make_staggered_batches_binary(1000), - make_staggered_batches_binary(1000), - JoinType::RightAnti, - Some(Box::new(col_lt_col_filter)), - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_binary(1000, left_extra), + make_staggered_batches_binary(1000, right_extra), + JoinType::RightAnti, + Some(Box::new(col_lt_col_filter)), + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_left_mark_join_1k_binary() { - JoinFuzzTestCase::new( - make_staggered_batches_binary(1000), - make_staggered_batches_binary(1000), - JoinType::LeftMark, - None, - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_binary(1000, left_extra), + make_staggered_batches_binary(1000, right_extra), + JoinType::LeftMark, + None, + ) + .run_test(&[HjSmj, NljHj], false) + .await + } 
} #[tokio::test] async fn test_left_mark_join_1k_binary_filtered() { - JoinFuzzTestCase::new( - make_staggered_batches_binary(1000), - make_staggered_batches_binary(1000), - JoinType::LeftMark, - Some(Box::new(col_lt_col_filter)), - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_binary(1000, left_extra), + make_staggered_batches_binary(1000, right_extra), + JoinType::LeftMark, + Some(Box::new(col_lt_col_filter)), + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } // todo: add JoinTestType::HjSmj after Right mark SortMergeJoin support #[tokio::test] async fn test_right_mark_join_1k_binary() { - JoinFuzzTestCase::new( - make_staggered_batches_binary(1000), - make_staggered_batches_binary(1000), - JoinType::RightMark, - None, - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_binary(1000, left_extra), + make_staggered_batches_binary(1000, right_extra), + JoinType::RightMark, + None, + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } #[tokio::test] async fn test_right_mark_join_1k_binary_filtered() { - JoinFuzzTestCase::new( - make_staggered_batches_binary(1000), - make_staggered_batches_binary(1000), - JoinType::RightMark, - Some(Box::new(col_lt_col_filter)), - ) - .run_test(&[HjSmj, NljHj], false) - .await + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + JoinFuzzTestCase::new( + make_staggered_batches_binary(1000, left_extra), + make_staggered_batches_binary(1000, right_extra), + JoinType::RightMark, + Some(Box::new(col_lt_col_filter)), + ) + .run_test(&[HjSmj, NljHj], false) + .await + } } type JoinFilterBuilder = Box, Arc) -> JoinFilter>; @@ -769,6 +852,7 @@ impl JoinFuzzTestCase { None, PartitionMode::Partitioned, NullEquality::NullEqualsNothing, + false, ) .unwrap(), ) @@ -841,7 +925,9 @@ impl JoinFuzzTestCase { std::fs::remove_dir_all(fuzz_debug).unwrap_or(()); std::fs::create_dir_all(fuzz_debug).unwrap(); let out_dir_name = &format!("{fuzz_debug}/batch_size_{batch_size}"); - println!("Test result data mismatch found. HJ rows {hj_rows}, SMJ rows {smj_rows}, NLJ rows {nlj_rows}"); + println!( + "Test result data mismatch found. HJ rows {hj_rows}, SMJ rows {smj_rows}, NLJ rows {nlj_rows}" + ); println!("The debug is ON. 
Input data will be saved to {out_dir_name}"); Self::save_partitioned_batches_as_parquet( @@ -892,10 +978,18 @@ impl JoinFuzzTestCase { } if join_tests.contains(&NljHj) { - let err_msg_rowcnt = format!("NestedLoopJoinExec and HashJoinExec produced different row counts, batch_size: {batch_size}"); + let err_msg_rowcnt = format!( + "NestedLoopJoinExec and HashJoinExec produced different row counts, batch_size: {batch_size}" + ); assert_eq!(nlj_rows, hj_rows, "{}", err_msg_rowcnt.as_str()); + if nlj_rows == 0 && hj_rows == 0 { + // both joins returned no rows, skip content comparison + continue; + } - let err_msg_contents = format!("NestedLoopJoinExec and HashJoinExec produced different results, batch_size: {batch_size}"); + let err_msg_contents = format!( + "NestedLoopJoinExec and HashJoinExec produced different results, batch_size: {batch_size}" + ); // row level compare if any of joins returns the result // the reason is different formatting when there is no rows for (i, (nlj_line, hj_line)) in nlj_formatted_sorted @@ -913,10 +1007,16 @@ impl JoinFuzzTestCase { } if join_tests.contains(&HjSmj) { - let err_msg_row_cnt = format!("HashJoinExec and SortMergeJoinExec produced different row counts, batch_size: {}", &batch_size); + let err_msg_row_cnt = format!( + "HashJoinExec and SortMergeJoinExec produced different row counts, batch_size: {}", + &batch_size + ); assert_eq!(hj_rows, smj_rows, "{}", err_msg_row_cnt.as_str()); - let err_msg_contents = format!("SortMergeJoinExec and HashJoinExec produced different results, batch_size: {}", &batch_size); + let err_msg_contents = format!( + "SortMergeJoinExec and HashJoinExec produced different results, batch_size: {}", + &batch_size + ); // row level compare if any of joins returns the result // the reason is different formatting when there is no rows if smj_rows > 0 || hj_rows > 0 { @@ -990,7 +1090,7 @@ impl JoinFuzzTestCase { /// Files can be of different sizes /// The method can be useful to read partitions have been saved by `save_partitioned_batches_as_parquet` /// for test debugging purposes - #[allow(dead_code)] + #[expect(dead_code)] async fn load_partitioned_batches_from_parquet( dir: &str, ) -> std::io::Result> { @@ -1028,10 +1128,142 @@ impl JoinFuzzTestCase { } } +/// Fuzz test: compare SMJ (with spilling) against HJ (no spill) for filtered +/// outer joins under memory pressure. This exercises the deferred filtering + +/// spill read-back path that unit tests can't easily cover with random data. 
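+/// The SMJ side runs under a 4 KiB memory limit with an OS-tmp-dir disk manager so it is forced to spill, while the HJ baseline runs with no memory limit; row counts and sorted row contents are then compared between the two plans.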
+#[tokio::test] +async fn test_filtered_join_spill_fuzz() { + let join_types = [JoinType::Left, JoinType::Right, JoinType::Full]; + + let runtime_spill = RuntimeEnvBuilder::new() + .with_memory_limit(4096, 1.0) + .with_disk_manager_builder( + DiskManagerBuilder::default().with_mode(DiskManagerMode::OsTmpDirectory), + ) + .build_arc() + .unwrap(); + + for join_type in &join_types { + for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] { + let input1 = make_staggered_batches_i32(1000, left_extra); + let input2 = make_staggered_batches_i32(1000, right_extra); + + let schema1 = input1[0].schema(); + let schema2 = input2[0].schema(); + let filter = col_lt_col_filter(schema1.clone(), schema2.clone()); + + let on = vec![ + ( + Arc::new(Column::new_with_schema("a", &schema1).unwrap()) as _, + Arc::new(Column::new_with_schema("a", &schema2).unwrap()) as _, + ), + ( + Arc::new(Column::new_with_schema("b", &schema1).unwrap()) as _, + Arc::new(Column::new_with_schema("b", &schema2).unwrap()) as _, + ), + ]; + + for batch_size in [2, 49, 100] { + let session_config = SessionConfig::new().with_batch_size(batch_size); + + // HJ baseline (no memory limit) + let left_hj = MemorySourceConfig::try_new_exec( + std::slice::from_ref(&input1), + schema1.clone(), + None, + ) + .unwrap(); + let right_hj = MemorySourceConfig::try_new_exec( + std::slice::from_ref(&input2), + schema2.clone(), + None, + ) + .unwrap(); + let hj = Arc::new( + HashJoinExec::try_new( + left_hj, + right_hj, + on.clone(), + Some(filter.clone()), + join_type, + None, + PartitionMode::Partitioned, + NullEquality::NullEqualsNothing, + false, + ) + .unwrap(), + ); + let ctx_hj = SessionContext::new_with_config(session_config.clone()); + let hj_collected = collect(hj, ctx_hj.task_ctx()).await.unwrap(); + + // SMJ with spilling + let left_smj = MemorySourceConfig::try_new_exec( + std::slice::from_ref(&input1), + schema1.clone(), + None, + ) + .unwrap(); + let right_smj = MemorySourceConfig::try_new_exec( + std::slice::from_ref(&input2), + schema2.clone(), + None, + ) + .unwrap(); + let smj = Arc::new( + SortMergeJoinExec::try_new( + left_smj, + right_smj, + on.clone(), + Some(filter.clone()), + *join_type, + vec![SortOptions::default(); on.len()], + NullEquality::NullEqualsNothing, + ) + .unwrap(), + ); + let task_ctx_spill = Arc::new( + TaskContext::default() + .with_session_config(session_config) + .with_runtime(Arc::clone(&runtime_spill)), + ); + let smj_collected = collect(smj, task_ctx_spill).await.unwrap(); + + let hj_rows: usize = hj_collected.iter().map(|b| b.num_rows()).sum(); + let smj_rows: usize = smj_collected.iter().map(|b| b.num_rows()).sum(); + + assert_eq!( + hj_rows, smj_rows, + "Row count mismatch for {join_type:?} batch_size={batch_size} \ + left_extra={left_extra} right_extra={right_extra}: \ + HJ={hj_rows} SMJ={smj_rows}" + ); + + if hj_rows > 0 { + let hj_fmt = + pretty_format_batches(&hj_collected).unwrap().to_string(); + let smj_fmt = + pretty_format_batches(&smj_collected).unwrap().to_string(); + + let mut hj_sorted: Vec<&str> = hj_fmt.trim().lines().collect(); + hj_sorted.sort_unstable(); + let mut smj_sorted: Vec<&str> = smj_fmt.trim().lines().collect(); + smj_sorted.sort_unstable(); + + assert_eq!( + hj_sorted, smj_sorted, + "Content mismatch for {join_type:?} batch_size={batch_size} \ + left_extra={left_extra} right_extra={right_extra}" + ); + } + } + } + } +} + /// Return randomly sized record batches with: /// two sorted int32 columns 'a', 'b' ranged from 0..99 as join columns /// two 
random int32 columns 'x', 'y' as other columns -fn make_staggered_batches_i32(len: usize) -> Vec<RecordBatch> { +fn make_staggered_batches_i32(len: usize, with_extra_column: bool) -> Vec<RecordBatch> { let mut rng = rand::rng(); let mut input12: Vec<(i32, i32)> = vec![(0, 0); len]; let mut input3: Vec<i32> = vec![0; len]; @@ -1044,17 +1276,28 @@ fn make_staggered_batches_i32(len: usize) -> Vec<RecordBatch> { input12.sort_unstable(); let input1 = Int32Array::from_iter_values(input12.clone().into_iter().map(|k| k.0)); let input2 = Int32Array::from_iter_values(input12.clone().into_iter().map(|k| k.1)); - let input3 = Int32Array::from_iter_values(input3); + let input3 = Int32Array::from_iter(input3.into_iter().map(|v| { + // ~10% NULLs in filter column to exercise NULL filter handling + if rng.random_range(0..10) == 0 { + None + } else { + Some(v) + } + })); let input4 = Int32Array::from_iter_values(input4); - // split into several record batches - let batch = RecordBatch::try_from_iter(vec![ + let mut columns = vec![ ("a", Arc::new(input1) as ArrayRef), ("b", Arc::new(input2) as ArrayRef), ("x", Arc::new(input3) as ArrayRef), - ("y", Arc::new(input4) as ArrayRef), - ]) - .unwrap(); + ]; + + if with_extra_column { + columns.push(("y", Arc::new(input4) as ArrayRef)); + } + + // split into several record batches + let batch = RecordBatch::try_from_iter(columns).unwrap(); // use a random number generator to pick a random sized output stagger_batch_with_seed(batch, 42) @@ -1070,7 +1313,10 @@ fn rand_bytes<R: Rng>(rng: &mut R, min: usize, max: usize) -> Vec<u8> { /// Return randomly sized record batches with: /// two sorted binary columns 'a', 'b' (lexicographically) as join columns /// two random binary columns 'x', 'y' as other columns -fn make_staggered_batches_binary(len: usize) -> Vec<RecordBatch> { +fn make_staggered_batches_binary( + len: usize, + with_extra_column: bool, +) -> Vec<RecordBatch> { let mut rng = rand::rng(); // produce (a,b) pairs then sort lexicographically so SMJ has naturally sorted keys @@ -1088,13 +1334,17 @@ fn make_staggered_batches_binary(len: usize) -> Vec<RecordBatch> { let x = BinaryArray::from_iter_values(input3.iter()); let y = BinaryArray::from_iter_values(input4.iter()); - let batch = RecordBatch::try_from_iter(vec![ + let mut columns = vec![ ("a", Arc::new(a) as ArrayRef), ("b", Arc::new(b) as ArrayRef), ("x", Arc::new(x) as ArrayRef), - ("y", Arc::new(y) as ArrayRef), - ]) - .unwrap(); + ]; + + if with_extra_column { + columns.push(("y", Arc::new(y) as ArrayRef)); + } + + let batch = RecordBatch::try_from_iter(columns).unwrap(); // preserve your existing randomized partitioning stagger_batch_with_seed(batch, 42) diff --git a/datafusion/core/tests/fuzz_cases/limit_fuzz.rs b/datafusion/core/tests/fuzz_cases/limit_fuzz.rs index 4c5ebf0402414..1c5741e7a21b3 100644 --- a/datafusion/core/tests/fuzz_cases/limit_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/limit_fuzz.rs @@ -24,7 +24,7 @@ use arrow::util::pretty::pretty_format_batches; use datafusion::datasource::MemTable; use datafusion::prelude::SessionContext; use datafusion_common::assert_contains; -use rand::{rng, Rng}; +use rand::{Rng, rng}; use std::sync::Arc; use test_utils::stagger_batch; diff --git a/datafusion/core/tests/fuzz_cases/merge_fuzz.rs b/datafusion/core/tests/fuzz_cases/merge_fuzz.rs index b92dec64e3f19..59430a98cc4b4 100644 --- a/datafusion/core/tests/fuzz_cases/merge_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/merge_fuzz.rs @@ -27,7 +27,7 @@ use arrow::{ use datafusion::datasource::memory::MemorySourceConfig; use datafusion::physical_plan::{ collect, - expressions::{col,
PhysicalSortExpr}, + expressions::{PhysicalSortExpr, col}, sorts::sort_preserving_merge::SortPreservingMergeExec, }; use datafusion::prelude::{SessionConfig, SessionContext}; diff --git a/datafusion/core/tests/fuzz_cases/mod.rs b/datafusion/core/tests/fuzz_cases/mod.rs index 9e2fd170f7f0c..edb53df382c62 100644 --- a/datafusion/core/tests/fuzz_cases/mod.rs +++ b/datafusion/core/tests/fuzz_cases/mod.rs @@ -15,20 +15,26 @@ // specific language governing permissions and limitations // under the License. +#[expect(clippy::needless_pass_by_value)] mod aggregate_fuzz; mod distinct_count_string_fuzz; +#[expect(clippy::needless_pass_by_value)] mod join_fuzz; mod merge_fuzz; +#[expect(clippy::needless_pass_by_value)] mod sort_fuzz; +#[expect(clippy::needless_pass_by_value)] mod sort_query_fuzz; mod topk_filter_pushdown; mod aggregation_fuzzer; +#[expect(clippy::needless_pass_by_value)] mod equivalence; mod pruning; mod limit_fuzz; +#[expect(clippy::needless_pass_by_value)] mod sort_preserving_repartition_fuzz; mod window_fuzz; diff --git a/datafusion/core/tests/fuzz_cases/once_exec.rs b/datafusion/core/tests/fuzz_cases/once_exec.rs index 49e2caaa7417c..403e377a690e2 100644 --- a/datafusion/core/tests/fuzz_cases/once_exec.rs +++ b/datafusion/core/tests/fuzz_cases/once_exec.rs @@ -17,13 +17,13 @@ use arrow_schema::SchemaRef; use datafusion_common::internal_datafusion_err; +use datafusion_common::tree_node::TreeNodeRecursion; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType}; use datafusion_physical_plan::{ DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, }; -use std::any::Any; use std::fmt::{Debug, Formatter}; use std::sync::{Arc, Mutex}; @@ -32,7 +32,7 @@ use std::sync::{Arc, Mutex}; pub struct OnceExec { /// the results to send back stream: Mutex<Option<SendableRecordBatchStream>>, - cache: PlanProperties, + cache: Arc<PlanProperties>, } impl Debug for OnceExec { @@ -46,7 +46,7 @@ impl OnceExec { let cache = Self::compute_properties(stream.schema()); Self { stream: Mutex::new(Some(stream)), - cache, + cache: Arc::new(cache), } } @@ -79,11 +79,7 @@ impl ExecutionPlan for OnceExec { Self::static_name() } - fn as_any(&self) -> &dyn Any { - self - } - - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc<PlanProperties> { &self.cache } @@ -110,4 +106,20 @@ impl ExecutionPlan for OnceExec { stream.ok_or_else(|| internal_datafusion_err!("Stream already consumed")) } + + fn apply_expressions( + &self, + f: &mut dyn FnMut( + &dyn datafusion_physical_plan::PhysicalExpr, + ) -> datafusion_common::Result<TreeNodeRecursion>, + ) -> datafusion_common::Result<TreeNodeRecursion> { + // Visit expressions in the output ordering from equivalence properties + let mut tnr = TreeNodeRecursion::Continue; + if let Some(ordering) = self.cache.output_ordering() { + for sort_expr in ordering { + tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?; + } + } + Ok(tnr) + } } diff --git a/datafusion/core/tests/fuzz_cases/pruning.rs b/datafusion/core/tests/fuzz_cases/pruning.rs index f8bd4dbc1a768..8ce5207f91190 100644 --- a/datafusion/core/tests/fuzz_cases/pruning.rs +++ b/datafusion/core/tests/fuzz_cases/pruning.rs @@ -29,9 +29,11 @@ use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::source::DataSourceExec; use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_physical_expr::PhysicalExpr; -use datafusion_physical_plan::{collect, filter::FilterExec, ExecutionPlan}; +use
datafusion_physical_plan::{ExecutionPlan, collect, filter::FilterExec}; use itertools::Itertools; -use object_store::{memory::InMemory, path::Path, ObjectStore, PutPayload}; +use object_store::{ + ObjectStore, ObjectStoreExt, PutPayload, memory::InMemory, path::Path, +}; use parquet::{ arrow::ArrowWriter, file::properties::{EnabledStatistics, WriterProperties}, @@ -276,13 +278,12 @@ async fn execute_with_predicate( ctx: &SessionContext, ) -> Vec { let parquet_source = if prune_stats { - ParquetSource::default().with_predicate(predicate.clone()) + ParquetSource::new(schema.clone()).with_predicate(predicate.clone()) } else { - ParquetSource::default() + ParquetSource::new(schema.clone()) }; let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("memory://").unwrap(), - schema.clone(), Arc::new(parquet_source), ) .with_file_group( diff --git a/datafusion/core/tests/fuzz_cases/record_batch_generator.rs b/datafusion/core/tests/fuzz_cases/record_batch_generator.rs index 45dba5f7864b1..22b145f5095a7 100644 --- a/datafusion/core/tests/fuzz_cases/record_batch_generator.rs +++ b/datafusion/core/tests/fuzz_cases/record_batch_generator.rs @@ -19,23 +19,23 @@ use std::sync::Arc; use arrow::array::{ArrayRef, DictionaryArray, PrimitiveArray, RecordBatch}; use arrow::datatypes::{ - ArrowPrimitiveType, BooleanType, DataType, Date32Type, Date64Type, Decimal128Type, - Decimal256Type, Decimal32Type, Decimal64Type, DurationMicrosecondType, + ArrowPrimitiveType, BooleanType, DataType, Date32Type, Date64Type, Decimal32Type, + Decimal64Type, Decimal128Type, Decimal256Type, DurationMicrosecondType, DurationMillisecondType, DurationNanosecondType, DurationSecondType, Field, - Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, + Float32Type, Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, IntervalYearMonthType, Schema, Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType, - TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, - UInt8Type, + TimestampNanosecondType, TimestampSecondType, UInt8Type, UInt16Type, UInt32Type, + UInt64Type, }; use arrow_schema::{ - DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, - DECIMAL256_MAX_SCALE, DECIMAL32_MAX_PRECISION, DECIMAL32_MAX_SCALE, - DECIMAL64_MAX_PRECISION, DECIMAL64_MAX_SCALE, + DECIMAL32_MAX_PRECISION, DECIMAL32_MAX_SCALE, DECIMAL64_MAX_PRECISION, + DECIMAL64_MAX_SCALE, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, + DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, }; -use datafusion_common::{arrow_datafusion_err, DataFusionError, Result}; -use rand::{rng, rngs::StdRng, Rng, SeedableRng}; +use datafusion_common::{Result, arrow_datafusion_err}; +use rand::{Rng, SeedableRng, rng, rngs::StdRng}; use test_utils::array_gen::{ BinaryArrayGenerator, BooleanArrayGenerator, DecimalArrayGenerator, PrimitiveArrayGenerator, StringArrayGenerator, diff --git a/datafusion/core/tests/fuzz_cases/sort_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_fuzz.rs index 28d28a6622a76..0d8a066d432dd 100644 --- a/datafusion/core/tests/fuzz_cases/sort_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_fuzz.rs @@ -20,7 +20,7 @@ use std::sync::Arc; use arrow::{ - array::{as_string_array, ArrayRef, Int32Array, StringArray}, + array::{ArrayRef, Int32Array, StringArray, as_string_array}, compute::SortOptions, record_batch::RecordBatch, }; @@ -28,7 +28,7 @@ use 
datafusion::datasource::memory::MemorySourceConfig; use datafusion::execution::runtime_env::RuntimeEnvBuilder; use datafusion::physical_plan::expressions::PhysicalSortExpr; use datafusion::physical_plan::sorts::sort::SortExec; -use datafusion::physical_plan::{collect, ExecutionPlan}; +use datafusion::physical_plan::{ExecutionPlan, collect}; use datafusion::prelude::{SessionConfig, SessionContext}; use datafusion_common::cast::as_int32_array; use datafusion_execution::memory_pool::GreedyMemoryPool; diff --git a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs index 99b20790fc46b..a1f38f161d6ea 100644 --- a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs @@ -20,34 +20,33 @@ mod sp_repartition_fuzz_tests { use std::sync::Arc; use arrow::array::{ArrayRef, Int64Array, RecordBatch, UInt64Array}; - use arrow::compute::{concat_batches, lexsort, SortColumn, SortOptions}; + use arrow::compute::{SortColumn, SortOptions, concat_batches, lexsort}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::source::DataSourceExec; use datafusion::physical_plan::{ - collect, + ExecutionPlan, Partitioning, collect, metrics::{BaselineMetrics, ExecutionPlanMetricsSet}, repartition::RepartitionExec, sorts::sort_preserving_merge::SortPreservingMergeExec, sorts::streaming_merge::StreamingMergeBuilder, stream::RecordBatchStreamAdapter, - ExecutionPlan, Partitioning, }; use datafusion::prelude::SessionContext; use datafusion_common::Result; use datafusion_execution::{config::SessionConfig, memory_pool::MemoryConsumer}; + use datafusion_physical_expr::ConstExpr; use datafusion_physical_expr::equivalence::{ EquivalenceClass, EquivalenceProperties, }; - use datafusion_physical_expr::expressions::{col, Column}; - use datafusion_physical_expr::ConstExpr; + use datafusion_physical_expr::expressions::{Column, col}; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; use test_utils::add_empty_batches; use itertools::izip; - use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng}; + use rand::{Rng, SeedableRng, rngs::StdRng, seq::SliceRandom}; // Generate a schema which consists of 6 columns (a, b, c, d, e, f) fn create_test_schema() -> Result { @@ -119,7 +118,7 @@ mod sp_repartition_fuzz_tests { schema: SchemaRef, ) -> Option { for expr in eq_group.iter() { - let col = expr.as_any().downcast_ref::().unwrap(); + let col = expr.downcast_ref::().unwrap(); let (idx, _field) = schema.column_with_name(col.name()).unwrap(); if let Some(res) = &existing_vec[idx] { return Some(res.clone()); @@ -150,7 +149,7 @@ mod sp_repartition_fuzz_tests { // Fill constant columns for constant in eq_properties.constants() { - let col = constant.expr.as_any().downcast_ref::().unwrap(); + let col = constant.expr.downcast_ref::().unwrap(); let (idx, _field) = schema.column_with_name(col.name()).unwrap(); let arr = Arc::new(UInt64Array::from_iter_values(vec![0; n_elem])) as ArrayRef; @@ -162,7 +161,7 @@ mod sp_repartition_fuzz_tests { let (sort_columns, indices): (Vec<_>, Vec<_>) = ordering .iter() .map(|PhysicalSortExpr { expr, options }| { - let col = expr.as_any().downcast_ref::().unwrap(); + let col = expr.downcast_ref::().unwrap(); let (idx, _field) = 
schema.column_with_name(col.name()).unwrap(); let arr = generate_random_array(n_elem, n_distinct); ( @@ -188,7 +187,7 @@ mod sp_repartition_fuzz_tests { .unwrap_or_else(|| generate_random_array(n_elem, n_distinct)); for expr in eq_group.iter() { - let col = expr.as_any().downcast_ref::().unwrap(); + let col = expr.downcast_ref::().unwrap(); let (idx, _field) = schema.column_with_name(col.name()).unwrap(); schema_vec[idx] = Some(representative_array.clone()); } @@ -302,7 +301,7 @@ mod sp_repartition_fuzz_tests { let mut handles = Vec::new(); for seed in seed_start..seed_end { - #[allow(clippy::disallowed_methods)] // spawn allowed only in tests + #[expect(clippy::disallowed_methods)] // spawn allowed only in tests let job = tokio::spawn(run_sort_preserving_repartition_test( make_staggered_batches::(n_row, n_distinct, seed as u64), is_first_roundrobin, diff --git a/datafusion/core/tests/fuzz_cases/sort_query_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_query_fuzz.rs index 2ce7db3ea4bc7..376306f3e0659 100644 --- a/datafusion/core/tests/fuzz_cases/sort_query_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_query_fuzz.rs @@ -24,24 +24,22 @@ use arrow::array::RecordBatch; use arrow_schema::SchemaRef; use datafusion::datasource::MemTable; use datafusion::prelude::{SessionConfig, SessionContext}; -use datafusion_common::{instant::Instant, Result}; +use datafusion_common::{Result, human_readable_size, instant::Instant}; use datafusion_execution::disk_manager::DiskManagerBuilder; -use datafusion_execution::memory_pool::{ - human_readable_size, MemoryPool, UnboundedMemoryPool, -}; +use datafusion_execution::memory_pool::{MemoryPool, UnboundedMemoryPool}; use datafusion_expr::display_schema; use datafusion_physical_plan::spill::get_record_batch_memory_size; use std::time::Duration; use datafusion_execution::{memory_pool::FairSpillPool, runtime_env::RuntimeEnvBuilder}; -use rand::prelude::IndexedRandom; use rand::Rng; -use rand::{rngs::StdRng, SeedableRng}; +use rand::prelude::IndexedRandom; +use rand::{SeedableRng, rngs::StdRng}; use crate::fuzz_cases::aggregation_fuzzer::check_equality_of_batches; use super::aggregation_fuzzer::ColumnDescr; -use super::record_batch_generator::{get_supported_types_columns, RecordBatchGenerator}; +use super::record_batch_generator::{RecordBatchGenerator, get_supported_types_columns}; /// Entry point for executing the sort query fuzzer. 
/// @@ -177,16 +175,16 @@ impl SortQueryFuzzer { n_round: usize, n_query: usize, ) -> bool { - if let Some(time_limit) = self.time_limit { - if Instant::now().duration_since(start_time) > time_limit { - println!( - "[SortQueryFuzzer] Time limit reached: {} queries ({} random configs each) in {} rounds", - n_round * self.queries_per_round + n_query, - self.config_variations_per_query, - n_round - ); - return true; - } + if let Some(time_limit) = self.time_limit + && Instant::now().duration_since(start_time) > time_limit + { + println!( + "[SortQueryFuzzer] Time limit reached: {} queries ({} random configs each) in {} rounds", + n_round * self.queries_per_round + n_query, + self.config_variations_per_query, + n_round + ); + return true; } false } diff --git a/datafusion/core/tests/fuzz_cases/spilling_fuzz_in_memory_constrained_env.rs b/datafusion/core/tests/fuzz_cases/spilling_fuzz_in_memory_constrained_env.rs index 6c1bd316cdd39..d401557e966d6 100644 --- a/datafusion/core/tests/fuzz_cases/spilling_fuzz_in_memory_constrained_env.rs +++ b/datafusion/core/tests/fuzz_cases/spilling_fuzz_in_memory_constrained_env.rs @@ -27,18 +27,18 @@ use arrow::{array::StringArray, compute::SortOptions, record_batch::RecordBatch} use arrow_schema::{DataType, Field, Schema}; use datafusion::common::Result; use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::physical_plan::ExecutionPlan; use datafusion::physical_plan::expressions::PhysicalSortExpr; use datafusion::physical_plan::sorts::sort::SortExec; -use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::SessionConfig; -use datafusion_execution::memory_pool::units::{KB, MB}; +use datafusion_common::units::{KB, MB}; use datafusion_execution::memory_pool::{ FairSpillPool, MemoryConsumer, MemoryReservation, }; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_functions_aggregate::array_agg::array_agg_udaf; use datafusion_physical_expr::aggregate::AggregateExprBuilder; -use datafusion_physical_expr::expressions::{col, Column}; +use datafusion_physical_expr::expressions::{Column, col}; use datafusion_physical_expr_common::sort_expr::LexOrdering; use datafusion_physical_plan::aggregates::{ AggregateExec, AggregateMode, PhysicalGroupBy, @@ -80,9 +80,9 @@ async fn test_sort_with_limited_memory() -> Result<()> { let total_spill_files_size = spill_count * record_batch_size; assert!( - total_spill_files_size > pool_size, - "Total spill files size {total_spill_files_size} should be greater than pool size {pool_size}", - ); + total_spill_files_size > pool_size, + "Total spill files size {total_spill_files_size} should be greater than pool size {pool_size}", + ); Ok(()) } @@ -126,8 +126,8 @@ async fn test_sort_with_limited_memory_and_different_sizes_of_record_batch() -> } #[tokio::test] -async fn test_sort_with_limited_memory_and_different_sizes_of_record_batch_and_changing_memory_reservation( -) -> Result<()> { +async fn test_sort_with_limited_memory_and_different_sizes_of_record_batch_and_changing_memory_reservation() +-> Result<()> { let record_batch_size = 8192; let pool_size = 2 * MB as usize; let task_ctx = { @@ -164,8 +164,8 @@ async fn test_sort_with_limited_memory_and_different_sizes_of_record_batch_and_c } #[tokio::test] -async fn test_sort_with_limited_memory_and_different_sizes_of_record_batch_and_take_all_memory( -) -> Result<()> { +async fn test_sort_with_limited_memory_and_different_sizes_of_record_batch_and_take_all_memory() +-> Result<()> { let record_batch_size = 8192; let pool_size = 
2 * MB as usize; let task_ctx = { @@ -278,9 +278,11 @@ async fn run_sort_test_with_limited_memory( let string_item_size = record_batch_memory_size / record_batch_size as usize; - let string_array = Arc::new(StringArray::from_iter_values( - (0..record_batch_size).map(|_| "a".repeat(string_item_size)), - )); + let string_array = + Arc::new(StringArray::from_iter_values(std::iter::repeat_n( + "a".repeat(string_item_size), + record_batch_size as usize, + ))); RecordBatch::try_new( Arc::clone(&schema), @@ -356,16 +358,16 @@ async fn test_aggregate_with_high_cardinality_with_limited_memory() -> Result<() let total_spill_files_size = spill_count * record_batch_size; assert!( - total_spill_files_size > pool_size, - "Total spill files size {total_spill_files_size} should be greater than pool size {pool_size}", - ); + total_spill_files_size > pool_size, + "Total spill files size {total_spill_files_size} should be greater than pool size {pool_size}", + ); Ok(()) } #[tokio::test] -async fn test_aggregate_with_high_cardinality_with_limited_memory_and_different_sizes_of_record_batch( -) -> Result<()> { +async fn test_aggregate_with_high_cardinality_with_limited_memory_and_different_sizes_of_record_batch() +-> Result<()> { let record_batch_size = 8192; let pool_size = 2 * MB as usize; let task_ctx = { @@ -398,8 +400,8 @@ async fn test_aggregate_with_high_cardinality_with_limited_memory_and_different_ } #[tokio::test] -async fn test_aggregate_with_high_cardinality_with_limited_memory_and_different_sizes_of_record_batch_and_changing_memory_reservation( -) -> Result<()> { +async fn test_aggregate_with_high_cardinality_with_limited_memory_and_different_sizes_of_record_batch_and_changing_memory_reservation() +-> Result<()> { let record_batch_size = 8192; let pool_size = 2 * MB as usize; let task_ctx = { @@ -432,8 +434,8 @@ async fn test_aggregate_with_high_cardinality_with_limited_memory_and_different_ } #[tokio::test] -async fn test_aggregate_with_high_cardinality_with_limited_memory_and_different_sizes_of_record_batch_and_take_all_memory( -) -> Result<()> { +async fn test_aggregate_with_high_cardinality_with_limited_memory_and_different_sizes_of_record_batch_and_take_all_memory() +-> Result<()> { let record_batch_size = 8192; let pool_size = 2 * MB as usize; let task_ctx = { @@ -466,8 +468,8 @@ async fn test_aggregate_with_high_cardinality_with_limited_memory_and_different_ } #[tokio::test] -async fn test_aggregate_with_high_cardinality_with_limited_memory_and_large_record_batch( -) -> Result<()> { +async fn test_aggregate_with_high_cardinality_with_limited_memory_and_large_record_batch() +-> Result<()> { let record_batch_size = 8192; let pool_size = 2 * MB as usize; let task_ctx = { @@ -536,9 +538,11 @@ async fn run_test_aggregate_with_high_cardinality( let string_item_size = record_batch_memory_size / record_batch_size as usize; - let string_array = Arc::new(StringArray::from_iter_values( - (0..record_batch_size).map(|_| "a".repeat(string_item_size)), - )); + let string_array = + Arc::new(StringArray::from_iter_values(std::iter::repeat_n( + "a".repeat(string_item_size), + record_batch_size as usize, + ))); RecordBatch::try_new( Arc::clone(&schema), diff --git a/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs b/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs index 7f994daeaa58c..d14afaf1b3267 100644 --- a/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs +++ b/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs @@ -31,7 +31,7 @@ use 
datafusion_execution::object_store::ObjectStoreUrl; use itertools::Itertools; use object_store::memory::InMemory; use object_store::path::Path; -use object_store::{ObjectStore, PutPayload}; +use object_store::{ObjectStore, ObjectStoreExt, PutPayload}; use parquet::arrow::ArrowWriter; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; diff --git a/datafusion/core/tests/fuzz_cases/window_fuzz.rs b/datafusion/core/tests/fuzz_cases/window_fuzz.rs index 65a41d39d3c54..82b6d0e4e9d89 100644 --- a/datafusion/core/tests/fuzz_cases/window_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/window_fuzz.rs @@ -18,24 +18,24 @@ use std::sync::Arc; use arrow::array::{ArrayRef, Int32Array, StringArray}; -use arrow::compute::{concat_batches, SortOptions}; +use arrow::compute::{SortOptions, concat_batches}; use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; use arrow::util::pretty::pretty_format_batches; use datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::source::DataSourceExec; use datafusion::functions_window::row_number::row_number_udwf; +use datafusion::physical_plan::InputOrderMode::{Linear, PartiallySorted, Sorted}; use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::physical_plan::windows::{ - create_window_expr, schema_add_window_field, BoundedWindowAggExec, WindowAggExec, + BoundedWindowAggExec, WindowAggExec, create_window_expr, schema_add_window_field, }; -use datafusion::physical_plan::InputOrderMode::{Linear, PartiallySorted, Sorted}; -use datafusion::physical_plan::{collect, InputOrderMode}; +use datafusion::physical_plan::{InputOrderMode, collect}; use datafusion::prelude::{SessionConfig, SessionContext}; use datafusion_common::HashMap; use datafusion_common::{Result, ScalarValue}; use datafusion_common_runtime::SpawnedTask; -use datafusion_expr::type_coercion::functions::fields_with_aggregate_udf; +use datafusion_expr::type_coercion::functions::fields_with_udf; use datafusion_expr::{ WindowFrame, WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition, }; @@ -445,14 +445,14 @@ fn get_random_function( let fn_name = window_fn_map.keys().collect::>()[rand_fn_idx]; let (window_fn, args) = window_fn_map.values().collect::>()[rand_fn_idx]; let mut args = args.clone(); - if let WindowFunctionDefinition::AggregateUDF(udf) = window_fn { - if !args.is_empty() { - // Do type coercion first argument - let a = args[0].clone(); - let dt = a.return_field(schema.as_ref()).unwrap(); - let coerced = fields_with_aggregate_udf(&[dt], udf).unwrap(); - args[0] = cast(a, schema, coerced[0].data_type().clone()).unwrap(); - } + if let WindowFunctionDefinition::AggregateUDF(udf) = window_fn + && !args.is_empty() + { + // Do type coercion first argument + let a = args[0].clone(); + let dt = a.return_field(schema.as_ref()).unwrap(); + let coerced = fields_with_udf(&[dt], udf.as_ref()).unwrap(); + args[0] = cast(a, schema, coerced[0].data_type().clone()).unwrap(); } (window_fn.clone(), args, (*fn_name).to_string()) @@ -569,10 +569,11 @@ fn convert_bound_to_current_row_if_applicable( ) { match bound { WindowFrameBound::Preceding(value) | WindowFrameBound::Following(value) => { - if let Ok(zero) = ScalarValue::new_zero(&value.data_type()) { - if value == &zero && rng.random_range(0..2) == 0 { - *bound = WindowFrameBound::CurrentRow; - } + if let Ok(zero) = ScalarValue::new_zero(&value.data_type()) + && value == &zero + && rng.random_range(0..2) == 0 + { + *bound = WindowFrameBound::CurrentRow; } } _ => {} @@ -588,7 +589,7 @@ async fn run_window_test( 
orderby_columns: Vec<&str>,
     search_mode: InputOrderMode,
 ) -> Result<()> {
-    let is_linear = !matches!(search_mode, Sorted);
+    let is_linear = search_mode != Sorted;
     let mut rng = StdRng::seed_from_u64(random_seed);
     let schema = input1[0].schema();
     let session_config = SessionConfig::new().with_batch_size(50);
@@ -644,10 +645,8 @@ async fn run_window_test(
     ) as _;
     // Table is ordered according to ORDER BY a, b, c. In the linear test we use PARTITION BY b, ORDER BY a.
     // For WindowAggExec to produce a correct result it needs the table to be ordered by b, a. Hence add a sort.
-    if is_linear {
-        if let Some(ordering) = LexOrdering::new(sort_keys) {
-            exec1 = Arc::new(SortExec::new(ordering, exec1)) as _;
-        }
+    if is_linear && let Some(ordering) = LexOrdering::new(sort_keys) {
+        exec1 = Arc::new(SortExec::new(ordering, exec1)) as _;
     }
     let extended_schema = schema_add_window_field(&args, &schema, &window_fn, &fn_name)?;
@@ -699,7 +698,9 @@ async fn run_window_test(
     // BoundedWindowAggExec should produce more chunks than the usual WindowAggExec.
     // Otherwise it means that we cannot generate results in running mode.
-    let err_msg = format!("Inconsistent result for window_frame: {window_frame:?}, window_fn: {window_fn:?}, args:{args:?}, random_seed: {random_seed:?}, search_mode: {search_mode:?}, partition_by_columns:{partition_by_columns:?}, orderby_columns: {orderby_columns:?}");
+    let err_msg = format!(
+        "Inconsistent result for window_frame: {window_frame:?}, window_fn: {window_fn:?}, args:{args:?}, random_seed: {random_seed:?}, search_mode: {search_mode:?}, partition_by_columns:{partition_by_columns:?}, orderby_columns: {orderby_columns:?}"
+    );
     // The check below makes sure that streaming execution generates more chunks than the bulk
     // execution, since algorithms and operators work on sliding windows in the streaming execution.
     // However, in the current test setup for some randomly generated window frame clauses it is not guaranteed
@@ -731,8 +732,12 @@ async fn run_window_test(
         .enumerate()
     {
         if !usual_line.eq(running_line) {
-            println!("Inconsistent result for window_frame at line:{i:?}: {window_frame:?}, window_fn: {window_fn:?}, args:{args:?}, pb_cols:{partition_by_columns:?}, ob_cols:{orderby_columns:?}, search_mode:{search_mode:?}");
-            println!("--------usual_formatted_sorted----------------running_formatted_sorted--------");
+            println!(
+                "Inconsistent result for window_frame at line:{i:?}: {window_frame:?}, window_fn: {window_fn:?}, args:{args:?}, pb_cols:{partition_by_columns:?}, ob_cols:{orderby_columns:?}, search_mode:{search_mode:?}"
+            );
+            println!(
+                "--------usual_formatted_sorted----------------running_formatted_sorted--------"
+            );
             for (line1, line2) in
                 usual_formatted_sorted.iter().zip(running_formatted_sorted)
             {
diff --git a/datafusion/core/tests/macro_hygiene/mod.rs b/datafusion/core/tests/macro_hygiene/mod.rs
index c9f33f6fdf0f4..9fd60cd1f06f3 100644
--- a/datafusion/core/tests/macro_hygiene/mod.rs
+++ b/datafusion/core/tests/macro_hygiene/mod.rs
@@ -73,7 +73,7 @@ mod config_field {
     #[test]
     fn test_macro() {
         #[derive(Debug)]
-        #[allow(dead_code)]
+        #[expect(dead_code)]
         struct E;

         impl std::fmt::Display for E {
@@ -84,7 +84,8 @@ mod config_field {

         impl std::error::Error for E {}

-        #[allow(dead_code)]
+        #[expect(dead_code)]
+        #[derive(Default)]
         struct S;

         impl std::str::FromStr for S {
diff --git a/datafusion/core/tests/memory_limit/memory_limit_validation/sort_mem_validation.rs b/datafusion/core/tests/memory_limit/memory_limit_validation/sort_mem_validation.rs
index e1d5f1b1ab198..bf04123fff7fa 100644
--- a/datafusion/core/tests/memory_limit/memory_limit_validation/sort_mem_validation.rs
+++ b/datafusion/core/tests/memory_limit/memory_limit_validation/sort_mem_validation.rs
@@ -21,14 +21,10 @@
 //! This file is organized as:
 //! - Test runners that spawn individual test processes
 //! - Test cases that contain the actual validation logic
-use log::info;
-use std::sync::Once;
 use std::{process::Command, str};

 use crate::memory_limit::memory_limit_validation::utils;

-static INIT: Once = Once::new();
-
 // ===========================================================================
 // Test runners:
 // Runners are split into multiple tests to run in parallel
@@ -69,49 +65,16 @@ fn sort_with_mem_limit_2_cols_2_runner() {
     spawn_test_process("sort_with_mem_limit_2_cols_2");
 }

-/// `spawn_test_process` might trigger multiple recompilations and the test binary
-/// size might grow indefinitely. This initializer ensures recompilation is only done
-/// once and the target size is bounded.
-///
-/// TODO: This is a hack, can be cleaned up if we have a better way to let multiple
-/// test cases run in different processes (instead of different threads by default)
-fn init_once() {
-    INIT.call_once(|| {
-        let _ = Command::new("cargo")
-            .arg("test")
-            .arg("--no-run")
-            .arg("--package")
-            .arg("datafusion")
-            .arg("--test")
-            .arg("core_integration")
-            .arg("--features")
-            .arg("extended_tests")
-            .env("DATAFUSION_TEST_MEM_LIMIT_VALIDATION", "1")
-            .output()
-            .expect("Failed to execute test command");
-    });
-}
-
-/// Helper function that executes a test in a separate process with the required environment
-/// variable set. Memory limit validation tasks need to measure memory resident set
-/// size (RSS), so they must run in a separate process.
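// ===========================================================================
// A minimal sketch of the re-invocation pattern adopted above (illustration
// only; "module::my_mem_test" is a hypothetical test path). Re-running the
// current test binary with libtest's `--exact`/`--nocapture` flags executes a
// single named test in a child process, which is what per-process RSS
// measurement requires.
fn run_in_child_process_sketch(test_path: &str) {
    use std::process::Command;

    let exe = std::env::current_exe().expect("failed to locate test binary");
    let status = Command::new(exe)
        .arg(test_path) // fully qualified name, e.g. "module::my_mem_test"
        .arg("--exact") // match the test name exactly, not as a prefix
        .arg("--nocapture") // stream the child's output for debugging
        .env("DATAFUSION_TEST_MEM_LIMIT_VALIDATION", "1")
        .status()
        .expect("failed to spawn child test process");
    assert!(status.success(), "child test '{test_path}' failed");
}
// ===========================================================================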
+/// Helper function that executes a test in a separate process with the required +/// environment variable set. Re-invokes the current test binary directly, +/// avoiding cargo overhead and recompilation. fn spawn_test_process(test: &str) { - init_once(); - let test_path = format!("memory_limit::memory_limit_validation::sort_mem_validation::{test}"); - info!("Running test: {test_path}"); - - // Run the test command - let output = Command::new("cargo") - .arg("test") - .arg("--package") - .arg("datafusion") - .arg("--test") - .arg("core_integration") - .arg("--features") - .arg("extended_tests") - .arg("--") + + let exe = std::env::current_exe().expect("Failed to get test binary path"); + + let output = Command::new(exe) .arg(&test_path) .arg("--exact") .arg("--nocapture") @@ -119,12 +82,9 @@ fn spawn_test_process(test: &str) { .output() .expect("Failed to execute test command"); - // Convert output to strings let stdout = str::from_utf8(&output.stdout).unwrap_or(""); let stderr = str::from_utf8(&output.stderr).unwrap_or(""); - info!("{stdout}"); - assert!( output.status.success(), "Test '{}' failed with status: {}\nstdout:\n{}\nstderr:\n{}", diff --git a/datafusion/core/tests/memory_limit/memory_limit_validation/utils.rs b/datafusion/core/tests/memory_limit/memory_limit_validation/utils.rs index 7b157b707a6de..2c9fae20c8606 100644 --- a/datafusion/core/tests/memory_limit/memory_limit_validation/utils.rs +++ b/datafusion/core/tests/memory_limit/memory_limit_validation/utils.rs @@ -16,16 +16,14 @@ // under the License. use datafusion_common_runtime::SpawnedTask; -use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; +use std::sync::atomic::{AtomicUsize, Ordering}; use sysinfo::{ProcessRefreshKind, ProcessesToUpdate, System}; -use tokio::time::{interval, Duration}; +use tokio::time::{Duration, interval}; use datafusion::prelude::{SessionConfig, SessionContext}; -use datafusion_execution::{ - memory_pool::{human_readable_size, FairSpillPool}, - runtime_env::RuntimeEnvBuilder, -}; +use datafusion_common::human_readable_size; +use datafusion_execution::{memory_pool::FairSpillPool, runtime_env::RuntimeEnvBuilder}; /// Measures the maximum RSS (in bytes) during the execution of an async task. RSS /// will be sampled every 7ms. @@ -40,7 +38,7 @@ use datafusion_execution::{ async fn measure_max_rss(f: F) -> (T, usize) where F: FnOnce() -> Fut, - Fut: std::future::Future, + Fut: Future, { // Initialize system information let mut system = System::new_all(); diff --git a/datafusion/core/tests/memory_limit/mod.rs b/datafusion/core/tests/memory_limit/mod.rs index 5d8a1d24181cb..90459960c5561 100644 --- a/datafusion/core/tests/memory_limit/mod.rs +++ b/datafusion/core/tests/memory_limit/mod.rs @@ -17,13 +17,13 @@ //! 
This module contains tests for limiting memory at runtime in DataFusion -use std::any::Any; use std::num::NonZeroUsize; use std::sync::{Arc, LazyLock}; #[cfg(feature = "extended_tests")] mod memory_limit_validation; mod repartition_mem_limit; +mod union_nullable_spill; use arrow::array::{ArrayRef, DictionaryArray, Int32Array, RecordBatch, StringViewArray}; use arrow::compute::SortOptions; use arrow::datatypes::{Int32Type, SchemaRef}; @@ -39,19 +39,19 @@ use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::streaming::PartitionStream; use datafusion::physical_plan::{ExecutionPlan, SendableRecordBatchStream}; use datafusion::prelude::{SessionConfig, SessionContext}; -use datafusion_catalog::streaming::StreamingTable; use datafusion_catalog::Session; -use datafusion_common::{assert_contains, Result}; +use datafusion_catalog::streaming::StreamingTable; +use datafusion_common::{Result, assert_contains}; +use datafusion_execution::TaskContext; use datafusion_execution::disk_manager::{DiskManagerBuilder, DiskManagerMode}; use datafusion_execution::memory_pool::{ FairSpillPool, GreedyMemoryPool, MemoryPool, TrackConsumersPool, }; use datafusion_execution::runtime_env::RuntimeEnv; -use datafusion_execution::TaskContext; use datafusion_expr::{Expr, TableType}; use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr}; -use datafusion_physical_optimizer::join_selection::JoinSelection; use datafusion_physical_optimizer::PhysicalOptimizerRule; +use datafusion_physical_optimizer::join_selection::JoinSelection; use datafusion_physical_plan::collect as collect_batches; use datafusion_physical_plan::common::collect; use datafusion_physical_plan::spill::get_record_batch_memory_size; @@ -212,6 +212,7 @@ async fn sort_merge_join_spill() { .with_config(config) .with_disk_manager_builder(DiskManagerBuilder::default()) .with_scenario(Scenario::AccessLogStreaming) + .with_expected_success() .run() .await } @@ -602,11 +603,16 @@ async fn test_disk_spill_limit_reached() -> Result<()> { .await .unwrap(); - let err = df.collect().await.unwrap_err(); - assert_contains!( - err.to_string(), - "The used disk space during the spilling process has exceeded the allowable limit" - ); + let error_message = df.collect().await.unwrap_err().to_string(); + for expected in [ + "The used disk space during the spilling process has exceeded the allowable limit", + "datafusion.runtime.max_temp_directory_size", + ] { + assert!( + error_message.contains(expected), + "'{expected}' is not contained by '{error_message}'" + ); + } Ok(()) } @@ -977,11 +983,13 @@ impl Scenario { descending: false, nulls_first: false, }; - let sort_information = vec![[ - PhysicalSortExpr::new(col("a", &schema).unwrap(), options), - PhysicalSortExpr::new(col("b", &schema).unwrap(), options), - ] - .into()]; + let sort_information = vec![ + [ + PhysicalSortExpr::new(col("a", &schema).unwrap(), options), + PhysicalSortExpr::new(col("b", &schema).unwrap(), options), + ] + .into(), + ]; let table = SortedTableProvider::new(batches, sort_information); Arc::new(table) @@ -1057,7 +1065,7 @@ fn make_dict_batches() -> Vec { let batch_size = 50; let mut i = 0; - let gen = std::iter::from_fn(move || { + let batch_gen = std::iter::from_fn(move || { // create values like // 0000000001 // 0000000002 @@ -1080,7 +1088,7 @@ fn make_dict_batches() -> Vec { let num_batches = 5; - let batches: Vec<_> = gen.take(num_batches).collect(); + let batches: Vec<_> = batch_gen.take(num_batches).collect(); 
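// Note: the iterator binding above is `batch_gen` rather than the previous
// `gen` because `gen` becomes a reserved keyword in the Rust 2024 edition.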
batches.iter().enumerate().for_each(|(i, batch)| { println!("Dict batch[{i}] size is: {}", batch.get_array_memory_size()); @@ -1136,10 +1144,6 @@ impl SortedTableProvider { #[async_trait] impl TableProvider for SortedTableProvider { - fn as_any(&self) -> &dyn Any { - self - } - fn schema(&self) -> SchemaRef { self.schema.clone() } diff --git a/datafusion/core/tests/memory_limit/repartition_mem_limit.rs b/datafusion/core/tests/memory_limit/repartition_mem_limit.rs index a7af2f01d1cc9..27bcd33926e99 100644 --- a/datafusion/core/tests/memory_limit/repartition_mem_limit.rs +++ b/datafusion/core/tests/memory_limit/repartition_mem_limit.rs @@ -25,7 +25,7 @@ use datafusion::{ use datafusion_catalog::MemTable; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_execution::runtime_env::RuntimeEnvBuilder; -use datafusion_physical_plan::{repartition::RepartitionExec, ExecutionPlanProperties}; +use datafusion_physical_plan::{ExecutionPlanProperties, repartition::RepartitionExec}; use futures::TryStreamExt; use itertools::Itertools; @@ -45,11 +45,14 @@ async fn test_repartition_memory_limit() { .with_batch_size(32) .with_target_partitions(2); let ctx = SessionContext::new_with_config_rt(config, Arc::new(runtime)); - let batches = vec![RecordBatch::try_from_iter(vec![( - "c1", - Arc::new(Int32Array::from_iter_values((0..10).cycle().take(100_000))) as ArrayRef, - )]) - .unwrap()]; + let batches = vec![ + RecordBatch::try_from_iter(vec![( + "c1", + Arc::new(Int32Array::from_iter_values((0..10).cycle().take(100_000))) + as ArrayRef, + )]) + .unwrap(), + ]; let table = Arc::new(MemTable::try_new(batches[0].schema(), vec![batches]).unwrap()); ctx.register_table("t", table).unwrap(); let plan = ctx @@ -71,7 +74,7 @@ async fn test_repartition_memory_limit() { let mut metrics = None; Arc::clone(&plan) .transform_down(|node| { - if node.as_any().is::() { + if node.is::() { metrics = node.metrics(); } Ok(Transformed::no(node)) diff --git a/datafusion/core/tests/memory_limit/union_nullable_spill.rs b/datafusion/core/tests/memory_limit/union_nullable_spill.rs new file mode 100644 index 0000000000000..c5ef2387d3cdc --- /dev/null +++ b/datafusion/core/tests/memory_limit/union_nullable_spill.rs @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
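// Background sketch for the nullability mismatch exercised in this file (a
// standalone illustration of assumed arrow-schema merge semantics, separate
// from the spill test below): when two inputs disagree on nullability, the
// merged field is nullable if either side is, which is why the UNION schema
// below reports a nullable `val`.
#[test]
fn union_nullability_merge_sketch() {
    use arrow_schema::{DataType, Field, Schema};

    let left = Schema::new(vec![Field::new("val", DataType::Int64, false)]);
    let right = Schema::new(vec![Field::new("val", DataType::Int64, true)]);
    // `try_merge` unions fields by name; nullability is OR-ed across inputs.
    let merged = Schema::try_merge([left, right]).unwrap();
    assert!(merged.field(0).is_nullable());
}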
+ +use std::sync::Arc; + +use arrow::array::{Array, Int64Array, RecordBatch}; +use arrow::compute::SortOptions; +use arrow::datatypes::{DataType, Field, Schema}; +use datafusion::datasource::memory::MemorySourceConfig; +use datafusion_execution::config::SessionConfig; +use datafusion_execution::memory_pool::FairSpillPool; +use datafusion_execution::runtime_env::RuntimeEnvBuilder; +use datafusion_physical_expr::expressions::col; +use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr}; +use datafusion_physical_plan::repartition::RepartitionExec; +use datafusion_physical_plan::sorts::sort::sort_batch; +use datafusion_physical_plan::union::UnionExec; +use datafusion_physical_plan::{ExecutionPlan, Partitioning}; +use futures::StreamExt; + +const NUM_BATCHES: usize = 200; +const ROWS_PER_BATCH: usize = 10; + +fn non_nullable_schema() -> Arc { + Arc::new(Schema::new(vec![ + Field::new("key", DataType::Int64, false), + Field::new("val", DataType::Int64, false), + ])) +} + +fn nullable_schema() -> Arc { + Arc::new(Schema::new(vec![ + Field::new("key", DataType::Int64, false), + Field::new("val", DataType::Int64, true), + ])) +} + +fn non_nullable_batches() -> Vec { + (0..NUM_BATCHES) + .map(|i| { + let start = (i * ROWS_PER_BATCH) as i64; + let keys: Vec = (start..start + ROWS_PER_BATCH as i64).collect(); + RecordBatch::try_new( + non_nullable_schema(), + vec![ + Arc::new(Int64Array::from(keys)), + Arc::new(Int64Array::from(vec![0i64; ROWS_PER_BATCH])), + ], + ) + .unwrap() + }) + .collect() +} + +fn nullable_batches() -> Vec { + (0..NUM_BATCHES) + .map(|i| { + let start = (i * ROWS_PER_BATCH) as i64; + let keys: Vec = (start..start + ROWS_PER_BATCH as i64).collect(); + let vals: Vec> = (0..ROWS_PER_BATCH) + .map(|j| if j % 3 == 1 { None } else { Some(j as i64) }) + .collect(); + RecordBatch::try_new( + nullable_schema(), + vec![ + Arc::new(Int64Array::from(keys)), + Arc::new(Int64Array::from(vals)), + ], + ) + .unwrap() + }) + .collect() +} + +fn build_task_ctx(pool_size: usize) -> Arc { + let session_config = SessionConfig::new().with_batch_size(2); + let runtime = RuntimeEnvBuilder::new() + .with_memory_pool(Arc::new(FairSpillPool::new(pool_size))) + .build_arc() + .unwrap(); + Arc::new( + datafusion_execution::TaskContext::default() + .with_session_config(session_config) + .with_runtime(runtime), + ) +} + +/// Exercises spilling through UnionExec -> RepartitionExec where union children +/// have mismatched nullability (one child's `val` is non-nullable, the other's +/// is nullable with NULLs). A tiny FairSpillPool forces all batches to spill. +/// +/// UnionExec returns child streams without schema coercion, so batches from +/// different children carry different per-field nullability into the shared +/// SpillPool. The IPC writer must use the SpillManager's canonical (nullable) +/// schema — not the first batch's schema — so readback batches are valid. 
+/// +/// Otherwise, sort_batch will panic with +/// `Column 'val' is declared as non-nullable but contains null values` +#[tokio::test] +async fn test_sort_union_repartition_spill_mixed_nullability() { + let non_nullable_exec = MemorySourceConfig::try_new_exec( + &[non_nullable_batches()], + non_nullable_schema(), + None, + ) + .unwrap(); + + let nullable_exec = + MemorySourceConfig::try_new_exec(&[nullable_batches()], nullable_schema(), None) + .unwrap(); + + let union_exec = UnionExec::try_new(vec![non_nullable_exec, nullable_exec]).unwrap(); + assert!(union_exec.schema().field(1).is_nullable()); + + let repartition = Arc::new( + RepartitionExec::try_new(union_exec, Partitioning::RoundRobinBatch(1)).unwrap(), + ); + + let task_ctx = build_task_ctx(200); + let mut stream = repartition.execute(0, task_ctx).unwrap(); + + let sort_expr = LexOrdering::new(vec![PhysicalSortExpr { + expr: col("key", &nullable_schema()).unwrap(), + options: SortOptions::default(), + }]) + .unwrap(); + + let mut total_rows = 0usize; + let mut total_nulls = 0usize; + while let Some(result) = stream.next().await { + let batch = result.unwrap(); + + let batch = sort_batch(&batch, &sort_expr, None).unwrap(); + + total_rows += batch.num_rows(); + total_nulls += batch.column(1).null_count(); + } + + assert_eq!( + total_rows, + NUM_BATCHES * ROWS_PER_BATCH * 2, + "All rows from both UNION branches should be present" + ); + assert!( + total_nulls > 0, + "Expected some null values in output (i.e. nullable batches were processed)" + ); +} diff --git a/datafusion/core/tests/optimizer/mod.rs b/datafusion/core/tests/optimizer/mod.rs index 9b2a5596827d0..a461c6f6c5962 100644 --- a/datafusion/core/tests/optimizer/mod.rs +++ b/datafusion/core/tests/optimizer/mod.rs @@ -19,7 +19,6 @@ //! datafusion-functions crate. 
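// A compact sketch of the guarantee-rewrite flow that the tests below rely
// on. The interval bounds here are illustrative; the real guarantees are
// defined inside `test_inequalities_non_null_bounded`.
#[test]
fn rewrite_with_guarantees_sketch() {
    use datafusion_common::tree_node::TransformedResult;
    use datafusion_expr::expr_rewriter::rewrite_with_guarantees;
    use datafusion_expr::interval_arithmetic::{Interval, NullableInterval};
    use datafusion_expr::{col, lit};

    // Guarantee: `x` is non-null and lies in [1, 3].
    let guarantees = vec![(
        col("x"),
        NullableInterval::NotNull {
            values: Interval::make(Some(1_i32), Some(3_i32)).unwrap(),
        },
    )];
    // Under that guarantee `x < 0` can never be true, so it folds to `false`.
    let simplified = rewrite_with_guarantees(col("x").lt(lit(0)), &guarantees)
        .data()
        .unwrap();
    assert_eq!(simplified, lit(false));
}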
use insta::assert_snapshot; -use std::any::Any; use std::collections::HashMap; use std::sync::Arc; @@ -27,17 +26,16 @@ use arrow::datatypes::{ DataType, Field, Fields, Schema, SchemaBuilder, SchemaRef, TimeUnit, }; use datafusion_common::config::ConfigOptions; -use datafusion_common::tree_node::{TransformedResult, TreeNode}; -use datafusion_common::{plan_err, DFSchema, Result, ScalarValue, TableReference}; +use datafusion_common::tree_node::TransformedResult; +use datafusion_common::{DFSchema, Result, ScalarValue, TableReference, plan_err}; use datafusion_expr::interval_arithmetic::{Interval, NullableInterval}; use datafusion_expr::{ - col, lit, AggregateUDF, BinaryExpr, Expr, ExprSchemable, LogicalPlan, Operator, - ScalarUDF, TableSource, WindowUDF, + AggregateUDF, BinaryExpr, Expr, ExprSchemable, HigherOrderUDF, LogicalPlan, Operator, + ScalarUDF, TableSource, WindowUDF, col, lit, }; use datafusion_functions::core::expr_ext::FieldAccessor; use datafusion_optimizer::analyzer::Analyzer; use datafusion_optimizer::optimizer::Optimizer; -use datafusion_optimizer::simplify_expressions::GuaranteeRewriter; use datafusion_optimizer::{OptimizerConfig, OptimizerContext}; use datafusion_sql::planner::{ContextProvider, SqlToRel}; use datafusion_sql::sqlparser::ast::Statement; @@ -45,6 +43,7 @@ use datafusion_sql::sqlparser::dialect::GenericDialect; use datafusion_sql::sqlparser::parser::Parser; use chrono::DateTime; +use datafusion_expr::expr_rewriter::rewrite_with_guarantees; use datafusion_functions::datetime; #[cfg(test)] @@ -217,6 +216,10 @@ impl ContextProvider for MyContextProvider { self.udfs.get(name).cloned() } + fn get_higher_order_meta(&self, _name: &str) -> Option> { + None + } + fn get_aggregate_meta(&self, _name: &str) -> Option> { None } @@ -237,6 +240,10 @@ impl ContextProvider for MyContextProvider { Vec::new() } + fn higher_order_function_names(&self) -> Vec { + Vec::new() + } + fn udaf_names(&self) -> Vec { Vec::new() } @@ -251,10 +258,6 @@ struct MyTableSource { } impl TableSource for MyTableSource { - fn as_any(&self) -> &dyn Any { - self - } - fn schema(&self) -> SchemaRef { self.schema.clone() } @@ -304,8 +307,6 @@ fn test_inequalities_non_null_bounded() { ), ]; - let mut rewriter = GuaranteeRewriter::new(guarantees.iter()); - // (original_expr, expected_simplification) let simplified_cases = &[ (col("x").lt(lit(0)), false), @@ -337,7 +338,7 @@ fn test_inequalities_non_null_bounded() { ), ]; - validate_simplified_cases(&mut rewriter, simplified_cases); + validate_simplified_cases(&guarantees, simplified_cases); let unchanged_cases = &[ col("x").gt(lit(2)), @@ -348,16 +349,20 @@ fn test_inequalities_non_null_bounded() { col("x").not_between(lit(3), lit(10)), ]; - validate_unchanged_cases(&mut rewriter, unchanged_cases); + validate_unchanged_cases(&guarantees, unchanged_cases); } -fn validate_simplified_cases(rewriter: &mut GuaranteeRewriter, cases: &[(Expr, T)]) -where +fn validate_simplified_cases( + guarantees: &[(Expr, NullableInterval)], + cases: &[(Expr, T)], +) where ScalarValue: From, T: Clone, { for (expr, expected_value) in cases { - let output = expr.clone().rewrite(rewriter).data().unwrap(); + let output = rewrite_with_guarantees(expr.clone(), guarantees) + .data() + .unwrap(); let expected = lit(ScalarValue::from(expected_value.clone())); assert_eq!( output, expected, @@ -365,9 +370,11 @@ where ); } } -fn validate_unchanged_cases(rewriter: &mut GuaranteeRewriter, cases: &[Expr]) { +fn validate_unchanged_cases(guarantees: &[(Expr, NullableInterval)], cases: &[Expr]) { 
for expr in cases { - let output = expr.clone().rewrite(rewriter).data().unwrap(); + let output = rewrite_with_guarantees(expr.clone(), guarantees) + .data() + .unwrap(); assert_eq!( &output, expr, "{expr} was simplified to {output}, but expected it to be unchanged" diff --git a/datafusion/core/tests/parquet/content_defined_chunking.rs b/datafusion/core/tests/parquet/content_defined_chunking.rs new file mode 100644 index 0000000000000..6a98ded1bd4cf --- /dev/null +++ b/datafusion/core/tests/parquet/content_defined_chunking.rs @@ -0,0 +1,182 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tests for parquet content-defined chunking (CDC). +//! +//! These tests verify that CDC options are correctly wired through to the +//! parquet writer by inspecting file metadata (compressed sizes, page +//! boundaries) on the written files. + +use arrow::array::{AsArray, Int32Array, StringArray}; +use arrow::datatypes::{DataType, Field, Int32Type, Int64Type, Schema}; +use arrow::record_batch::RecordBatch; +use datafusion::prelude::{ParquetReadOptions, SessionContext}; +use datafusion_common::config::{CdcOptions, TableParquetOptions}; +use parquet::arrow::ArrowWriter; +use parquet::arrow::arrow_reader::ArrowReaderMetadata; +use parquet::file::properties::WriterProperties; +use std::fs::File; +use std::sync::Arc; +use tempfile::NamedTempFile; + +/// Create a RecordBatch with enough data to exercise CDC chunking. +fn make_test_batch(num_rows: usize) -> RecordBatch { + let ids: Vec = (0..num_rows as i32).collect(); + // ~100 bytes per row to generate enough data for CDC page splits + let payloads: Vec = (0..num_rows) + .map(|i| format!("row-{i:06}-payload-{}", "x".repeat(80))) + .collect(); + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("payload", DataType::Utf8, false), + ])); + + RecordBatch::try_new( + schema, + vec![ + Arc::new(Int32Array::from(ids)), + Arc::new(StringArray::from(payloads)), + ], + ) + .unwrap() +} + +/// Build WriterProperties from TableParquetOptions, exercising the same +/// code path that DataFusion's parquet sink uses. +fn writer_props( + opts: &mut TableParquetOptions, + schema: &Arc, +) -> WriterProperties { + opts.arrow_schema(schema); + parquet::file::properties::WriterPropertiesBuilder::try_from( + opts as &TableParquetOptions, + ) + .unwrap() + .build() +} + +/// Write a batch to a temp parquet file and return the file handle. 
+fn write_parquet_file(batch: &RecordBatch, props: WriterProperties) -> NamedTempFile { + let tmp = tempfile::Builder::new() + .suffix(".parquet") + .tempfile() + .unwrap(); + let mut writer = + ArrowWriter::try_new(tmp.reopen().unwrap(), batch.schema(), Some(props)).unwrap(); + writer.write(batch).unwrap(); + writer.close().unwrap(); + tmp +} + +/// Read parquet metadata from a file. +fn read_metadata(file: &NamedTempFile) -> parquet::file::metadata::ParquetMetaData { + let f = File::open(file.path()).unwrap(); + let reader_meta = ArrowReaderMetadata::load(&f, Default::default()).unwrap(); + reader_meta.metadata().as_ref().clone() +} + +/// Write parquet with CDC enabled, read it back via DataFusion, and verify +/// the data round-trips correctly. +#[tokio::test] +async fn cdc_data_round_trip() { + let batch = make_test_batch(5000); + + let mut opts = TableParquetOptions::default(); + opts.global.use_content_defined_chunking = Some(CdcOptions::default()); + let props = writer_props(&mut opts, &batch.schema()); + + let tmp = write_parquet_file(&batch, props); + + // Read back via DataFusion and verify row count + let ctx = SessionContext::new(); + ctx.register_parquet( + "data", + tmp.path().to_str().unwrap(), + ParquetReadOptions::default(), + ) + .await + .unwrap(); + + let result = ctx + .sql("SELECT COUNT(*), MIN(id), MAX(id) FROM data") + .await + .unwrap() + .collect() + .await + .unwrap(); + + let row = &result[0]; + let count = row.column(0).as_primitive::().value(0); + let min_id = row.column(1).as_primitive::().value(0); + let max_id = row.column(2).as_primitive::().value(0); + + assert_eq!(count, 5000); + assert_eq!(min_id, 0); + assert_eq!(max_id, 4999); +} + +/// Verify that CDC options are reflected in the parquet file metadata. +/// With small chunk sizes, CDC should produce different page boundaries +/// compared to default (no CDC) writing. +#[tokio::test] +async fn cdc_affects_page_boundaries() { + let batch = make_test_batch(5000); + + // Write WITHOUT CDC + let mut no_cdc_opts = TableParquetOptions::default(); + let no_cdc_file = + write_parquet_file(&batch, writer_props(&mut no_cdc_opts, &batch.schema())); + let no_cdc_meta = read_metadata(&no_cdc_file); + + // Write WITH CDC using small chunk sizes to maximize effect + let mut cdc_opts = TableParquetOptions::default(); + cdc_opts.global.use_content_defined_chunking = Some(CdcOptions { + min_chunk_size: 512, + max_chunk_size: 2048, + norm_level: 0, + }); + let cdc_file = + write_parquet_file(&batch, writer_props(&mut cdc_opts, &batch.schema())); + let cdc_meta = read_metadata(&cdc_file); + + // Both files should have the same number of rows + assert_eq!( + no_cdc_meta.file_metadata().num_rows(), + cdc_meta.file_metadata().num_rows(), + ); + + // Compare the uncompressed sizes of columns across all row groups. + // CDC with small chunk sizes should produce different page boundaries. + let no_cdc_sizes: Vec = no_cdc_meta + .row_groups() + .iter() + .flat_map(|rg| rg.columns().iter().map(|c| c.uncompressed_size())) + .collect(); + + let cdc_sizes: Vec = cdc_meta + .row_groups() + .iter() + .flat_map(|rg| rg.columns().iter().map(|c| c.uncompressed_size())) + .collect(); + + assert_ne!( + no_cdc_sizes, cdc_sizes, + "CDC with small chunk sizes should produce different page layouts \ + than default writing. 
no_cdc={no_cdc_sizes:?}, cdc={cdc_sizes:?}" + ); +} diff --git a/datafusion/core/tests/parquet/custom_reader.rs b/datafusion/core/tests/parquet/custom_reader.rs index 3a1f06656236c..ae11fa9a11334 100644 --- a/datafusion/core/tests/parquet/custom_reader.rs +++ b/datafusion/core/tests/parquet/custom_reader.rs @@ -20,7 +20,7 @@ use std::ops::Range; use std::sync::Arc; use std::time::SystemTime; -use arrow::array::{ArrayRef, Int64Array, Int8Array, StringArray}; +use arrow::array::{ArrayRef, Int8Array, Int64Array, StringArray}; use arrow::datatypes::{Field, Schema, SchemaBuilder}; use arrow::record_batch::RecordBatch; use datafusion::datasource::listing::PartitionedFile; @@ -31,8 +31,8 @@ use datafusion::datasource::physical_plan::{ use datafusion::physical_plan::collect; use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion::prelude::SessionContext; -use datafusion_common::test_util::batches_to_sort_string; use datafusion_common::Result; +use datafusion_common::test_util::batches_to_sort_string; use bytes::Bytes; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; @@ -43,10 +43,10 @@ use futures::{FutureExt, TryFutureExt}; use insta::assert_snapshot; use object_store::memory::InMemory; use object_store::path::Path; -use object_store::{ObjectMeta, ObjectStore}; +use object_store::{ObjectMeta, ObjectStore, ObjectStoreExt}; +use parquet::arrow::ArrowWriter; use parquet::arrow::arrow_reader::ArrowReaderOptions; use parquet::arrow::async_reader::AsyncFileReader; -use parquet::arrow::ArrowWriter; use parquet::errors::ParquetError; use parquet::file::metadata::ParquetMetaData; @@ -69,18 +69,14 @@ async fn route_data_access_ops_to_parquet_file_reader_factory() { store_parquet_in_memory(vec![batch]).await; let file_group = parquet_files_meta .into_iter() - .map(|meta| PartitionedFile { - object_meta: meta, - partition_values: vec![], - range: None, - statistics: None, - extensions: Some(Arc::new(String::from(EXPECTED_USER_DEFINED_METADATA))), - metadata_size_hint: None, + .map(|meta| { + PartitionedFile::new_from_meta(meta) + .with_extensions(Arc::new(String::from(EXPECTED_USER_DEFINED_METADATA))) }) .collect(); let source = Arc::new( - ParquetSource::default() + ParquetSource::new(file_schema.clone()) // prepare the scan .with_parquet_file_reader_factory(Arc::new( InMemoryParquetFileReaderFactory(Arc::clone(&in_memory_object_store)), @@ -89,7 +85,6 @@ async fn route_data_access_ops_to_parquet_file_reader_factory() { let base_config = FileScanConfigBuilder::new( // just any url that doesn't point to in memory object store ObjectStoreUrl::local_filesystem(), - file_schema, source, ) .with_file_group(file_group) diff --git a/datafusion/core/tests/parquet/encryption.rs b/datafusion/core/tests/parquet/encryption.rs index 09b93f06ce85d..12bdb600c2ac9 100644 --- a/datafusion/core/tests/parquet/encryption.rs +++ b/datafusion/core/tests/parquet/encryption.rs @@ -25,11 +25,11 @@ use datafusion::dataframe::DataFrameWriteOptions; use datafusion::datasource::listing::ListingOptions; use datafusion::prelude::{ParquetReadOptions, SessionContext}; use datafusion_common::config::{EncryptionFactoryOptions, TableParquetOptions}; -use datafusion_common::{assert_batches_sorted_eq, exec_datafusion_err, DataFusionError}; +use datafusion_common::{DataFusionError, assert_batches_sorted_eq, exec_datafusion_err}; use datafusion_datasource_parquet::ParquetFormat; use datafusion_execution::parquet_encryption::EncryptionFactory; -use parquet::arrow::arrow_reader::{ArrowReaderMetadata, 
ArrowReaderOptions}; use parquet::arrow::ArrowWriter; +use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions}; use parquet::encryption::decrypt::FileDecryptionProperties; use parquet::encryption::encrypt::FileEncryptionProperties; use parquet::file::column_crypto_metadata::ColumnCryptoMetaData; @@ -54,6 +54,7 @@ async fn read_parquet_test_data<'a, T: Into>( .unwrap() } +#[expect(clippy::needless_pass_by_value)] pub fn write_batches( path: PathBuf, props: WriterProperties, @@ -114,8 +115,8 @@ async fn round_trip_encryption() { // Read encrypted parquet let ctx: SessionContext = SessionContext::new(); - let options = - ParquetReadOptions::default().file_decryption_properties((&decrypt).into()); + let options = ParquetReadOptions::default() + .file_decryption_properties((&decrypt).try_into().unwrap()); let encrypted_batches = read_parquet_test_data( tempfile.into_os_string().into_string().unwrap(), diff --git a/datafusion/core/tests/parquet/expr_adapter.rs b/datafusion/core/tests/parquet/expr_adapter.rs new file mode 100644 index 0000000000000..fd70d74a9140c --- /dev/null +++ b/datafusion/core/tests/parquet/expr_adapter.rs @@ -0,0 +1,1128 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
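// Quick standalone reference (illustrative values only) for the
// `OffsetBuffer::from_lengths` construction used by `NestedListKind::array`
// below: per-row lengths such as [2, 1] become offsets [0, 2, 3] into the
// flat child values.
#[test]
fn offset_buffer_from_lengths_sketch() {
    use std::sync::Arc;

    use arrow::array::{Array, Int32Array, ListArray};
    use arrow::buffer::OffsetBuffer;
    use arrow_schema::{DataType, Field};

    let values = Int32Array::from(vec![1, 2, 3]);
    let item = Arc::new(Field::new("item", DataType::Int32, true));
    let list = ListArray::new(
        item,
        OffsetBuffer::<i32>::from_lengths([2, 1]), // rows of length 2 and 1
        Arc::new(values),
        None, // no top-level nulls
    );
    assert_eq!(list.len(), 2);
    assert_eq!(list.value(0).len(), 2);
    assert_eq!(list.value(1).len(), 1);
}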
+ +use std::sync::Arc; + +use arrow::array::{ + Array, ArrayRef, BooleanArray, Int32Array, Int64Array, LargeListArray, ListArray, + RecordBatch, StringArray, StructArray, record_batch, +}; +use arrow::buffer::OffsetBuffer; +use arrow::compute::concat_batches; +use arrow_schema::{DataType, Field, Fields, Schema, SchemaRef}; +use bytes::{BufMut, BytesMut}; +use datafusion::assert_batches_eq; +use datafusion::common::Result; +use datafusion::datasource::listing::{ + ListingTable, ListingTableConfig, ListingTableConfigExt, +}; +use datafusion::prelude::{SessionConfig, SessionContext}; +use datafusion_common::DataFusionError; +use datafusion_common::ScalarValue; +use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; +use datafusion_datasource::ListingTableUrl; +use datafusion_execution::object_store::ObjectStoreUrl; +use datafusion_physical_expr::PhysicalExpr; +use datafusion_physical_expr::expressions::{self, Column}; +use datafusion_physical_expr_adapter::{ + DefaultPhysicalExprAdapter, DefaultPhysicalExprAdapterFactory, PhysicalExprAdapter, + PhysicalExprAdapterFactory, +}; +use object_store::{ObjectStore, ObjectStoreExt, memory::InMemory, path::Path}; +use parquet::arrow::ArrowWriter; + +async fn write_parquet(batch: RecordBatch, store: Arc, path: &str) { + let mut out = BytesMut::new().writer(); + { + let mut writer = ArrowWriter::try_new(&mut out, batch.schema(), None).unwrap(); + writer.write(&batch).unwrap(); + writer.finish().unwrap(); + } + let data = out.into_inner().freeze(); + store.put(&Path::from(path), data.into()).await.unwrap(); +} + +#[derive(Debug, Clone, Copy)] +enum NestedListKind { + List, + LargeList, +} + +impl NestedListKind { + fn field_data_type(self, item_field: Arc) -> DataType { + match self { + Self::List => DataType::List(item_field), + Self::LargeList => DataType::LargeList(item_field), + } + } + + fn array( + self, + item_field: Arc, + lengths: Vec, + values: ArrayRef, + ) -> ArrayRef { + match self { + Self::List => Arc::new(ListArray::new( + item_field, + OffsetBuffer::::from_lengths(lengths), + values, + None, + )), + Self::LargeList => Arc::new(LargeListArray::new( + item_field, + OffsetBuffer::::from_lengths(lengths), + values, + None, + )), + } + } + + fn name(self) -> &'static str { + match self { + Self::List => "list", + Self::LargeList => "large_list", + } + } +} + +#[derive(Debug)] +// Fixture row for one nested struct element inside the `messages` list column. +struct NestedMessageRow<'a> { + id: i32, + name: &'a str, + chain: Option<&'a str>, + ignored: Option, +} + +fn message_fields( + chain_type: DataType, + chain_nullable: bool, + include_chain: bool, + include_ignored: bool, +) -> Fields { + let mut fields = vec![ + Arc::new(Field::new("id", DataType::Int32, false)), + Arc::new(Field::new("name", DataType::Utf8, true)), + ]; + if include_chain { + fields.push(Arc::new(Field::new("chain", chain_type, chain_nullable))); + } + if include_ignored { + fields.push(Arc::new(Field::new("ignored", DataType::Int32, true))); + } + fields.into() +} + +// Helper to construct the target message schema for struct evolution tests. +// The schema always has id (Int64), name (Utf8), and chain with parameterized type. 
+fn target_message_fields(chain_type: DataType, chain_nullable: bool) -> Fields { + vec![ + Arc::new(Field::new("id", DataType::Int64, false)), + Arc::new(Field::new("name", DataType::Utf8, true)), + Arc::new(Field::new("chain", chain_type, chain_nullable)), + ] + .into() +} + +// Helper to build message columns in canonical order (id, name, chain, ignored) +// based on which optional fields are present in the schema. +fn build_message_columns( + id_array: &ArrayRef, + name_array: &ArrayRef, + chain_vec: &[Option<&str>], + ignored_array: &ArrayRef, + fields: &Fields, +) -> Vec<ArrayRef> { + let mut columns = vec![Arc::clone(id_array), Arc::clone(name_array)]; + + for field in fields.iter().skip(2) { + match field.name().as_str() { + "chain" => { + let chain_array = match field.data_type() { + DataType::Utf8 => { + Arc::new(StringArray::from(chain_vec.to_vec())) as ArrayRef + } + DataType::Struct(chain_fields) => { + let chain_struct = StructArray::new( + chain_fields.clone(), + vec![Arc::new(StringArray::from(chain_vec.to_vec())) + as ArrayRef], + None, + ); + Arc::new(chain_struct) as ArrayRef + } + other => panic!("unexpected chain field type: {other:?}"), + }; + columns.push(chain_array); + } + "ignored" => columns.push(Arc::clone(ignored_array)), + _ => {} + } + } + columns +} + +fn nested_messages_batch( + kind: NestedListKind, + row_id: i32, + messages: &[NestedMessageRow<'_>], + fields: &Fields, +) -> RecordBatch { + let item_field = Arc::new(Field::new("item", DataType::Struct(fields.clone()), true)); + + let (ids_vec, names_vec, chain_vec, ignored_vec) = messages.iter().fold( + ( + Vec::with_capacity(messages.len()), + Vec::with_capacity(messages.len()), + Vec::with_capacity(messages.len()), + Vec::with_capacity(messages.len()), + ), + |(mut ids, mut names, mut chains, mut ignoreds), msg| { + ids.push(msg.id); + names.push(Some(msg.name)); + chains.push(msg.chain); + ignoreds.push(msg.ignored); + (ids, names, chains, ignoreds) + }, + ); + + // Build all arrays once + let id_array = Arc::new(Int32Array::from(ids_vec)) as ArrayRef; + let name_array = Arc::new(StringArray::from(names_vec)) as ArrayRef; + let ignored_array = Arc::new(Int32Array::from(ignored_vec)) as ArrayRef; + + // Build columns in canonical order (id, name, chain, ignored) based on field schema + let columns = + build_message_columns(&id_array, &name_array, &chain_vec, &ignored_array, fields); + + let struct_array = StructArray::new(fields.clone(), columns, None); + + // Compute the message data type first, then move item_field into kind.array() + let message_data_type = kind.field_data_type(item_field.clone()); + let messages_array = + kind.array(item_field, vec![messages.len()], Arc::new(struct_array)); + let schema = Arc::new(Schema::new(vec![ + Field::new("row_id", DataType::Int32, false), + Field::new("messages", message_data_type, true), + ])); + + RecordBatch::try_new( + schema, + vec![ + Arc::new(Int32Array::from(vec![row_id])) as ArrayRef, + messages_array, + ], + ) + .unwrap() +} + +async fn register_memory_listing_table( + ctx: &SessionContext, + store: Arc<dyn ObjectStore>, + base_path: &str, + table_schema: SchemaRef, +) { + let store_url = ObjectStoreUrl::parse("memory://").unwrap(); + ctx.register_object_store(store_url.as_ref(), Arc::clone(&store)); + + let listing_table_config = + ListingTableConfig::new(ListingTableUrl::parse(base_path).unwrap()) + .infer_options(&ctx.state()) + .await + .unwrap() + .with_schema(table_schema) + .with_expr_adapter_factory(Arc::new(DefaultPhysicalExprAdapterFactory)); + + let table =
ListingTable::try_new(listing_table_config).unwrap(); + ctx.register_table("t", Arc::new(table)).unwrap(); +} + +fn test_context() -> SessionContext { + let mut cfg = SessionConfig::new() + .with_collect_statistics(false) + .with_parquet_pruning(false) + .with_parquet_page_index_pruning(false); + cfg.options_mut().execution.parquet.pushdown_filters = true; + SessionContext::new_with_config(cfg) +} + +fn nested_list_table_schema( + kind: NestedListKind, + target_message_fields: Fields, +) -> SchemaRef { + let target_item = Arc::new(Field::new( + "item", + DataType::Struct(target_message_fields), + true, + )); + Arc::new(Schema::new(vec![ + Field::new("row_id", DataType::Int32, false), + Field::new("messages", kind.field_data_type(target_item), true), + ])) +} + +// Helper to extract message values from a nested list column. +// Returns the values at indices 0 and 1 from either a ListArray or LargeListArray. +fn extract_nested_list_values( + kind: NestedListKind, + column: &ArrayRef, +) -> (ArrayRef, ArrayRef) { + match kind { + NestedListKind::List => { + let list = column + .as_any() + .downcast_ref::<ListArray>() + .expect("messages should be a ListArray"); + (list.value(0), list.value(1)) + } + NestedListKind::LargeList => { + let list = column + .as_any() + .downcast_ref::<LargeListArray>() + .expect("messages should be a LargeListArray"); + (list.value(0), list.value(1)) + } + } +} + +// Helper to set up a nested list test fixture. +// Creates an in-memory store, writes the provided batches to parquet files, +// creates a SessionContext, and registers the resulting table. +// Returns the prepared context ready for queries. +async fn setup_nested_list_test( + kind: NestedListKind, + prefix_base: &str, + batches: Vec<(String, RecordBatch)>, + table_schema: SchemaRef, +) -> SessionContext { + let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>; + let prefix = format!("{}_{}", kind.name(), prefix_base); + + for (filename, batch) in batches { + write_parquet(batch, Arc::clone(&store), &format!("{prefix}/{filename}")).await; + } + + let ctx = test_context(); + register_memory_listing_table( + &ctx, + Arc::clone(&store), + &format!("memory:///{prefix}/"), + table_schema, + ) + .await; + + ctx +} + +async fn assert_nested_list_struct_schema_evolution(kind: NestedListKind) -> Result<()> { + // old.parquet shape: messages item struct has only (id, name), no `chain`. + let old_batch = nested_messages_batch( + kind, + 1, + &[ + NestedMessageRow { + id: 10, + name: "alpha", + chain: None, + ignored: None, + }, + NestedMessageRow { + id: 20, + name: "beta", + chain: None, + ignored: None, + }, + ], + &message_fields(DataType::Utf8, true, false, false), + ); + + // new.parquet shape: messages item struct adds nullable `chain` and extra `ignored`. + let new_batch = nested_messages_batch( + kind, + 2, + &[NestedMessageRow { + id: 30, + name: "gamma", + chain: Some("eth"), + ignored: Some(99), + }], + &message_fields(DataType::Utf8, true, true, true), + ); + + // Logical table schema expects evolved shape (id, name, nullable `chain`) and + // should ignore source-only `ignored` during reads. + let table_schema = + nested_list_table_schema(kind, target_message_fields(DataType::Utf8, true)); + + let ctx = setup_nested_list_test( + kind, + "struct_evolution", + vec![ + ("old.parquet".to_string(), old_batch), + ("new.parquet".to_string(), new_batch), + ], + table_schema, + ) + .await; + + let select_all = ctx + .sql("SELECT * FROM t ORDER BY row_id") + .await?
+ .collect() + .await?; + let all_rows = concat_batches(&select_all[0].schema(), &select_all)?; + + let row_ids = all_rows + .column(0) + .as_any() + .downcast_ref::<Int32Array>() + .expect("row_id should be Int32"); + assert_eq!(row_ids.values(), &[1, 2]); + + let (messages0, messages1) = extract_nested_list_values(kind, all_rows.column(1)); + + let messages0 = messages0 + .as_any() + .downcast_ref::<StructArray>() + .expect("messages[0] should be a StructArray"); + let old_ids = messages0 + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::<Int64Array>() + .unwrap(); + assert_eq!(old_ids.values(), &[10, 20]); + let old_chain = messages0 + .column_by_name("chain") + .unwrap() + .as_any() + .downcast_ref::<StringArray>() + .unwrap(); + assert_eq!(old_chain.iter().collect::<Vec<_>>(), vec![None, None]); + + let messages1 = messages1 + .as_any() + .downcast_ref::<StructArray>() + .expect("messages[1] should be a StructArray"); + assert!( + messages1.column_by_name("ignored").is_none(), + "extra source fields should not appear in the logical schema" + ); + let new_chain = messages1 + .column_by_name("chain") + .unwrap() + .as_any() + .downcast_ref::<StringArray>() + .unwrap(); + assert_eq!(new_chain.iter().collect::<Vec<_>>(), vec![Some("eth")]); + + let projected = ctx + .sql( + "SELECT row_id, get_field(messages[1], 'id') AS msg_id, \ + get_field(messages[1], 'chain') AS chain \ + FROM t ORDER BY row_id", + ) + .await? + .collect() + .await?; + + #[rustfmt::skip] + let expected = [ + "+--------+--------+-------+", + "| row_id | msg_id | chain |", + "+--------+--------+-------+", + "| 1 | 10 | |", + "| 2 | 30 | eth |", + "+--------+--------+-------+", + ]; + assert_batches_eq!(expected, &projected); + + Ok(()) +} + +// Implement a custom PhysicalExprAdapterFactory that fills in missing columns with +// the default value for the field type: +// - Int64 columns are filled with `1` +// - Utf8 columns are filled with `'b'` +#[derive(Debug)] +struct CustomPhysicalExprAdapterFactory; + +impl PhysicalExprAdapterFactory for CustomPhysicalExprAdapterFactory { + fn create( + &self, + logical_file_schema: SchemaRef, + physical_file_schema: SchemaRef, + ) -> Result<Arc<dyn PhysicalExprAdapter>> { + Ok(Arc::new(CustomPhysicalExprAdapter { + logical_file_schema: Arc::clone(&logical_file_schema), + physical_file_schema: Arc::clone(&physical_file_schema), + inner: Arc::new(DefaultPhysicalExprAdapter::new( + logical_file_schema, + physical_file_schema, + )), + })) + } +} + +#[derive(Debug, Clone)] +struct CustomPhysicalExprAdapter { + logical_file_schema: SchemaRef, + physical_file_schema: SchemaRef, + inner: Arc<dyn PhysicalExprAdapter>, +} + +impl PhysicalExprAdapter for CustomPhysicalExprAdapter { + fn rewrite(&self, mut expr: Arc<dyn PhysicalExpr>) -> Result<Arc<dyn PhysicalExpr>> { + expr = expr + .transform(|expr| { + if let Some(column) = expr.downcast_ref::<Column>() { + let field_name = column.name(); + if self + .physical_file_schema + .field_with_name(field_name) + .ok() + .is_none() + { + let field = self + .logical_file_schema + .field_with_name(field_name) + .map_err(|_| { + DataFusionError::Plan(format!( + "Field '{field_name}' not found in logical file schema", + )) + })?; + // If the field does not exist, create a default value expression. + // The defaults are deliberately non-null so tests can tell this + // adapter's behavior apart from the default (NULL-filling) adapter. + let default_value = match field.data_type() { + DataType::Int64 => ScalarValue::Int64(Some(1)), + DataType::Utf8 => ScalarValue::Utf8(Some("b".to_string())), + _ => unimplemented!( + "Unsupported data type: {}", + field.data_type() + ), + }; + return Ok(Transformed::yes(Arc::new(
expressions::Literal::new(default_value), + ))); + } + } + + Ok(Transformed::no(expr)) + }) + .data()?; + self.inner.rewrite(expr) + } +} + +#[tokio::test] +async fn test_custom_schema_adapter_and_custom_expression_adapter() { + let batch = + record_batch!(("extra", Int64, [1, 2, 3]), ("c1", Int32, [1, 2, 3])).unwrap(); + + let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>; + let store_url = ObjectStoreUrl::parse("memory://").unwrap(); + let path = "test.parquet"; + write_parquet(batch, store.clone(), path).await; + + let table_schema = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Int64, false), + Field::new("c2", DataType::Utf8, true), + ])); + + let mut cfg = SessionConfig::new() + // Disable statistics collection for this test; otherwise early pruning makes it hard to demonstrate data adaptation + .with_collect_statistics(false) + .with_parquet_pruning(false) + .with_parquet_page_index_pruning(false); + cfg.options_mut().execution.parquet.pushdown_filters = true; + let ctx = SessionContext::new_with_config(cfg); + ctx.register_object_store(store_url.as_ref(), Arc::clone(&store)); + assert!( + !ctx.state() + .config_mut() + .options_mut() + .execution + .collect_statistics + ); + assert!(!ctx.state().config().collect_statistics()); + + // Test with DefaultPhysicalExprAdapterFactory - missing columns are filled with NULL + let listing_table_config = + ListingTableConfig::new(ListingTableUrl::parse("memory:///").unwrap()) + .infer_options(&ctx.state()) + .await + .unwrap() + .with_schema(table_schema.clone()) + .with_expr_adapter_factory(Arc::new(DefaultPhysicalExprAdapterFactory)); + + let table = ListingTable::try_new(listing_table_config).unwrap(); + ctx.register_table("t", Arc::new(table)).unwrap(); + + let batches = ctx + .sql("SELECT c2, c1 FROM t WHERE c1 = 2 AND c2 IS NULL") + .await + .unwrap() + .collect() + .await + .unwrap(); + + let expected = [ + "+----+----+", + "| c2 | c1 |", + "+----+----+", + "| | 2 |", + "+----+----+", + ]; + assert_batches_eq!(expected, &batches); + + // Test with a custom physical expr adapter + // PhysicalExprAdapterFactory now handles both predicates AND projections + // CustomPhysicalExprAdapterFactory fills missing columns with 'b' for Utf8 + let listing_table_config = + ListingTableConfig::new(ListingTableUrl::parse("memory:///").unwrap()) + .infer_options(&ctx.state()) + .await + .unwrap() + .with_schema(table_schema.clone()) + .with_expr_adapter_factory(Arc::new(CustomPhysicalExprAdapterFactory)); + let table = ListingTable::try_new(listing_table_config).unwrap(); + ctx.deregister_table("t").unwrap(); + ctx.register_table("t", Arc::new(table)).unwrap(); + let batches = ctx + .sql("SELECT c2, c1 FROM t WHERE c1 = 2 AND c2 = 'b'") + .await + .unwrap() + .collect() + .await + .unwrap(); + // With CustomPhysicalExprAdapterFactory, missing column c2 is filled with 'b' + // in both the predicate (c2 = 'b' becomes 'b' = 'b' -> true) and the projection + let expected = [ + "+----+----+", + "| c2 | c1 |", + "+----+----+", + "| b | 2 |", + "+----+----+", + ]; + assert_batches_eq!(expected, &batches); +} + +/// Test demonstrating how to implement a custom PhysicalExprAdapterFactory +/// that fills missing columns with non-null default values. +/// +/// PhysicalExprAdapterFactory rewrites expressions to use literals for +/// missing columns, handling schema evolution efficiently at planning time.
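+/// +/// Conceptually (a sketch, not the exact planner representation): for a file +/// that lacks `c2`, the adapter substitutes a literal for the missing column +/// reference, so the predicate `Column("c2") = Literal("b")` becomes +/// `Literal("b") = Literal("b")`, which is true for every row.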
+#[tokio::test] +async fn test_physical_expr_adapter_with_non_null_defaults() { + // File only has c1 column + let batch = record_batch!(("c1", Int32, [10, 20, 30])).unwrap(); + + let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>; + let store_url = ObjectStoreUrl::parse("memory://").unwrap(); + write_parquet(batch, store.clone(), "defaults_test.parquet").await; + + // Table schema has additional columns c2 (Utf8) and c3 (Int64) that don't exist in file + let table_schema = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Int64, false), // type differs from file (Int32 vs Int64) + Field::new("c2", DataType::Utf8, true), // missing from file + Field::new("c3", DataType::Int64, true), // missing from file + ])); + + let mut cfg = SessionConfig::new() + .with_collect_statistics(false) + .with_parquet_pruning(false); + cfg.options_mut().execution.parquet.pushdown_filters = true; + let ctx = SessionContext::new_with_config(cfg); + ctx.register_object_store(store_url.as_ref(), Arc::clone(&store)); + + // CustomPhysicalExprAdapterFactory fills: + // - missing Utf8 columns with 'b' + // - missing Int64 columns with 1 + let listing_table_config = + ListingTableConfig::new(ListingTableUrl::parse("memory:///").unwrap()) + .infer_options(&ctx.state()) + .await + .unwrap() + .with_schema(table_schema.clone()) + .with_expr_adapter_factory(Arc::new(CustomPhysicalExprAdapterFactory)); + + let table = ListingTable::try_new(listing_table_config).unwrap(); + ctx.register_table("t", Arc::new(table)).unwrap(); + + // Query all columns - missing columns should have default values + let batches = ctx + .sql("SELECT c1, c2, c3 FROM t ORDER BY c1") + .await + .unwrap() + .collect() + .await + .unwrap(); + + // c1 is cast from Int32 to Int64, c2 defaults to 'b', c3 defaults to 1 + let expected = [ + "+----+----+----+", + "| c1 | c2 | c3 |", + "+----+----+----+", + "| 10 | b | 1 |", + "| 20 | b | 1 |", + "| 30 | b | 1 |", + "+----+----+----+", + ]; + assert_batches_eq!(expected, &batches); + + // Verify predicates work with default values + // c3 = 1 should match all rows since default is 1 + let batches = ctx + .sql("SELECT c1 FROM t WHERE c3 = 1 ORDER BY c1") + .await + .unwrap() + .collect() + .await + .unwrap(); + + #[rustfmt::skip] + let expected = [ + "+----+", + "| c1 |", + "+----+", + "| 10 |", + "| 20 |", + "| 30 |", + "+----+", + ]; + assert_batches_eq!(expected, &batches); + + // c3 = 999 should match no rows + let batches = ctx + .sql("SELECT c1 FROM t WHERE c3 = 999") + .await + .unwrap() + .collect() + .await + .unwrap(); + + #[rustfmt::skip] + let expected = [ + "++", + "++", + ]; + assert_batches_eq!(expected, &batches); +} + +#[tokio::test] +async fn test_struct_schema_evolution_projection_and_filter() -> Result<()> { + use std::collections::HashMap; + + // Physical struct: {id: Int32, name: Utf8} + let physical_struct_fields: Fields = vec![ + Arc::new(Field::new("id", DataType::Int32, false)), + Arc::new(Field::new("name", DataType::Utf8, true)), + ] + .into(); + + let struct_array = StructArray::new( + physical_struct_fields.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + Arc::new(StringArray::from(vec!["a", "b", "c"])) as ArrayRef, + ], + None, + ); + + let physical_schema = Arc::new(Schema::new(vec![Field::new( + "s", + DataType::Struct(physical_struct_fields), + true, + )])); + + let batch = + RecordBatch::try_new(Arc::clone(&physical_schema), vec![Arc::new(struct_array)])?; + + let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>; + let store_url =
ObjectStoreUrl::parse("memory://").unwrap(); + write_parquet(batch, store.clone(), "struct_evolution.parquet").await; + + // Logical struct: {id: Int64?, name: Utf8?, extra: Boolean?} + metadata + let logical_struct_fields: Fields = vec![ + Arc::new(Field::new("id", DataType::Int64, true)), + Arc::new(Field::new("name", DataType::Utf8, true)), + Arc::new(Field::new("extra", DataType::Boolean, true).with_metadata( + HashMap::from([("nested_meta".to_string(), "1".to_string())]), + )), + ] + .into(); + + let table_schema = Arc::new(Schema::new(vec![ + Field::new("s", DataType::Struct(logical_struct_fields), false) + .with_metadata(HashMap::from([("top_meta".to_string(), "1".to_string())])), + ])); + + let mut cfg = SessionConfig::new() + .with_collect_statistics(false) + .with_parquet_pruning(false) + .with_parquet_page_index_pruning(false); + cfg.options_mut().execution.parquet.pushdown_filters = true; + + let ctx = SessionContext::new_with_config(cfg); + ctx.register_object_store(store_url.as_ref(), Arc::clone(&store)); + + let listing_table_config = + ListingTableConfig::new(ListingTableUrl::parse("memory:///").unwrap()) + .infer_options(&ctx.state()) + .await + .unwrap() + .with_schema(table_schema.clone()) + .with_expr_adapter_factory(Arc::new(DefaultPhysicalExprAdapterFactory)); + + let table = ListingTable::try_new(listing_table_config).unwrap(); + ctx.register_table("t", Arc::new(table)).unwrap(); + + let batches = ctx + .sql("SELECT s FROM t") + .await + .unwrap() + .collect() + .await + .unwrap(); + assert_eq!(batches.len(), 1); + + // Verify top-level metadata propagation + let output_schema = batches[0].schema(); + let s_field = output_schema.field_with_name("s").unwrap(); + assert_eq!( + s_field.metadata().get("top_meta").map(String::as_str), + Some("1") + ); + + // Verify nested struct type/field propagation + values + let s_array = batches[0] + .column(0) + .as_any() + .downcast_ref::<StructArray>() + .expect("expected struct array"); + + let id_array = s_array + .column_by_name("id") + .expect("id column") + .as_any() + .downcast_ref::<Int64Array>() + .expect("id should be cast to Int64"); + assert_eq!(id_array.values(), &[1, 2, 3]); + + let extra_array = s_array.column_by_name("extra").expect("extra column"); + assert_eq!(extra_array.null_count(), 3); + + // Verify nested field metadata propagation + let extra_field = match s_field.data_type() { + DataType::Struct(fields) => fields + .iter() + .find(|f| f.name() == "extra") + .expect("extra field"), + other => panic!("expected struct type for s, got {other:?}"), + }; + assert_eq!( + extra_field + .metadata() + .get("nested_meta") + .map(String::as_str), + Some("1") + ); + + // Smoke test: filtering on a missing nested field evaluates correctly + let filtered = ctx + .sql("SELECT get_field(s, 'extra') AS extra FROM t WHERE get_field(s, 'extra') IS NULL") + .await + .unwrap() + .collect() + .await + .unwrap(); + assert_eq!(filtered.len(), 1); + assert_eq!(filtered[0].num_rows(), 3); + let extra = filtered[0] + .column(0) + .as_any() + .downcast_ref::<BooleanArray>() + .expect("extra should be a boolean array"); + assert_eq!(extra.null_count(), 3); + + Ok(()) +} + +/// Macro to generate paired test functions for List and LargeList variants. +/// Expands to two `#[tokio::test]` functions with the specified names. +macro_rules! test_struct_schema_evolution_pair { + ( + list: $list_test:ident, + large_list: $large_list_test:ident, + fn: $assertion_fn:path $(, args: $($arg:expr),+)?
+ ) => { + #[tokio::test] + async fn $list_test() { + $assertion_fn(NestedListKind::List $(, $($arg),+)?).await; + } + + #[tokio::test] + async fn $large_list_test() { + $assertion_fn(NestedListKind::LargeList $(, $($arg),+)?).await; + } + }; + ( + list: $list_test:ident, + large_list: $large_list_test:ident, + fn_result: $assertion_fn:path + ) => { + #[tokio::test] + async fn $list_test() -> Result<()> { + $assertion_fn(NestedListKind::List).await + } + + #[tokio::test] + async fn $large_list_test() -> Result<()> { + $assertion_fn(NestedListKind::LargeList).await + } + }; +} + +test_struct_schema_evolution_pair!( + list: test_list_struct_schema_evolution_end_to_end, + large_list: test_large_list_struct_schema_evolution_end_to_end, + fn_result: assert_nested_list_struct_schema_evolution +); + +async fn assert_nested_list_struct_schema_evolution_errors( + kind: NestedListKind, + chain_type: DataType, + chain_nullable: bool, + expected_error: &str, +) { + let batch = nested_messages_batch( + kind, + 1, + &[NestedMessageRow { + id: 10, + name: "alpha", + chain: Some("eth"), + ignored: None, + }], + &message_fields(DataType::Utf8, true, true, false), + ); + + let table_schema = + nested_list_table_schema(kind, target_message_fields(chain_type, chain_nullable)); + + let ctx = setup_nested_list_test( + kind, + "struct_evolution_error", + vec![("data.parquet".to_string(), batch)], + table_schema, + ) + .await; + + let err = ctx + .sql("SELECT * FROM t") + .await + .unwrap() + .collect() + .await + .unwrap_err(); + assert!( + err.to_string().contains(expected_error), + "expected error to contain '{expected_error}', got: {err}" + ); +} + +async fn assert_non_nullable_missing_chain_field_fails(kind: NestedListKind) { + assert_nested_list_struct_schema_evolution_errors( + kind, + DataType::Utf8, + false, + "non-nullable", + ) + .await; +} + +async fn assert_incompatible_chain_field_fails(kind: NestedListKind) { + assert_nested_list_struct_schema_evolution_errors( + kind, + incompatible_chain_type(), + true, + "Cannot cast struct field 'chain'", + ) + .await; +} + +fn incompatible_chain_type() -> DataType { + DataType::Struct(vec![Arc::new(Field::new("value", DataType::Utf8, true))].into()) +} + +test_struct_schema_evolution_pair!( + list: test_list_struct_schema_evolution_non_nullable_missing_field_fails, + large_list: test_large_list_struct_schema_evolution_non_nullable_missing_field_fails, + fn: assert_non_nullable_missing_chain_field_fails +); + +test_struct_schema_evolution_pair!( + list: test_list_struct_schema_evolution_incompatible_field_fails, + large_list: test_large_list_struct_schema_evolution_incompatible_field_fails, + fn: assert_incompatible_chain_field_fails +); + +/// Test demonstrating that a single PhysicalExprAdapterFactory instance can be +/// reused across multiple ListingTable instances. +/// +/// This addresses the concern: "This is important for ListingTable. A test for +/// ListingTable would add assurance that the functionality is retained [i.e. 
we +/// can re-use a PhysicalExprAdapterFactory] +#[tokio::test] +async fn test_physical_expr_adapter_factory_reuse_across_tables() { + // Create two different parquet files with different schemas + // File 1: has column c1 only + let batch1 = record_batch!(("c1", Int32, [1, 2, 3])).unwrap(); + // File 2: has column c1 only but different data + let batch2 = record_batch!(("c1", Int32, [10, 20, 30])).unwrap(); + + let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>; + let store_url = ObjectStoreUrl::parse("memory://").unwrap(); + + // Write files to different paths + write_parquet(batch1, store.clone(), "table1/data.parquet").await; + write_parquet(batch2, store.clone(), "table2/data.parquet").await; + + // Table schema has additional columns that don't exist in files + let table_schema = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Int64, false), + Field::new("c2", DataType::Utf8, true), // missing from files + ])); + + let mut cfg = SessionConfig::new() + .with_collect_statistics(false) + .with_parquet_pruning(false); + cfg.options_mut().execution.parquet.pushdown_filters = true; + let ctx = SessionContext::new_with_config(cfg); + ctx.register_object_store(store_url.as_ref(), Arc::clone(&store)); + + // Create ONE factory instance wrapped in Arc - this will be REUSED + let factory: Arc<dyn PhysicalExprAdapterFactory> = + Arc::new(CustomPhysicalExprAdapterFactory); + + // Create ListingTable 1 using the shared factory + let listing_table_config1 = + ListingTableConfig::new(ListingTableUrl::parse("memory:///table1/").unwrap()) + .infer_options(&ctx.state()) + .await + .unwrap() + .with_schema(table_schema.clone()) + .with_expr_adapter_factory(Arc::clone(&factory)); // Clone the Arc rather than creating a new factory + + let table1 = ListingTable::try_new(listing_table_config1).unwrap(); + ctx.register_table("t1", Arc::new(table1)).unwrap(); + + // Create ListingTable 2 using the SAME factory instance + let listing_table_config2 = + ListingTableConfig::new(ListingTableUrl::parse("memory:///table2/").unwrap()) + .infer_options(&ctx.state()) + .await + .unwrap() + .with_schema(table_schema.clone()) + .with_expr_adapter_factory(Arc::clone(&factory)); // Reuse the same factory + + let table2 = ListingTable::try_new(listing_table_config2).unwrap(); + ctx.register_table("t2", Arc::new(table2)).unwrap(); + + // Verify table 1 works correctly with the shared factory + // CustomPhysicalExprAdapterFactory fills missing Utf8 columns with 'b' + let batches = ctx + .sql("SELECT c1, c2 FROM t1 ORDER BY c1") + .await + .unwrap() + .collect() + .await + .unwrap(); + + let expected = [ + "+----+----+", + "| c1 | c2 |", + "+----+----+", + "| 1 | b |", + "| 2 | b |", + "| 3 | b |", + "+----+----+", + ]; + assert_batches_eq!(expected, &batches); + + // Verify table 2 also works correctly with the SAME shared factory + let batches = ctx + .sql("SELECT c1, c2 FROM t2 ORDER BY c1") + .await + .unwrap() + .collect() + .await + .unwrap(); + + let expected = [ + "+----+----+", + "| c1 | c2 |", + "+----+----+", + "| 10 | b |", + "| 20 | b |", + "| 30 | b |", + "+----+----+", + ]; + assert_batches_eq!(expected, &batches); + + // Verify predicates work on both tables with the shared factory + let batches = ctx + .sql("SELECT c1 FROM t1 WHERE c2 = 'b' ORDER BY c1") + .await + .unwrap() + .collect() + .await + .unwrap(); + + #[rustfmt::skip] + let expected = [ + "+----+", + "| c1 |", + "+----+", + "| 1 |", + "| 2 |", + "| 3 |", + "+----+", + ]; + assert_batches_eq!(expected, &batches); + + let batches = ctx + .sql("SELECT c1 FROM t2 WHERE c2 = 'b' ORDER BY c1") +
.await + .unwrap() + .collect() + .await + .unwrap(); + + #[rustfmt::skip] + let expected = [ + "+----+", + "| c1 |", + "+----+", + "| 10 |", + "| 20 |", + "| 30 |", + "+----+", + ]; + assert_batches_eq!(expected, &batches); +} diff --git a/datafusion/core/tests/parquet/external_access_plan.rs b/datafusion/core/tests/parquet/external_access_plan.rs index 5135f956852c3..9ff8137687c95 100644 --- a/datafusion/core/tests/parquet/external_access_plan.rs +++ b/datafusion/core/tests/parquet/external_access_plan.rs @@ -21,7 +21,7 @@ use std::path::Path; use std::sync::Arc; use crate::parquet::utils::MetricsFinder; -use crate::parquet::{create_data_batch, Scenario}; +use crate::parquet::{Scenario, create_data_batch}; use arrow::datatypes::SchemaRef; use arrow::util::pretty::pretty_format_batches; @@ -29,17 +29,17 @@ use datafusion::common::Result; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::physical_plan::ParquetSource; use datafusion::prelude::SessionContext; -use datafusion_common::{assert_contains, DFSchema}; +use datafusion_common::{DFSchema, assert_contains}; use datafusion_datasource_parquet::{ParquetAccessPlan, RowGroupAccess}; use datafusion_execution::object_store::ObjectStoreUrl; -use datafusion_expr::{col, lit, Expr}; -use datafusion_physical_plan::metrics::{MetricValue, MetricsSet}; +use datafusion_expr::{Expr, col, lit}; use datafusion_physical_plan::ExecutionPlan; +use datafusion_physical_plan::metrics::{MetricValue, MetricsSet}; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::source::DataSourceExec; -use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; use parquet::arrow::ArrowWriter; +use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; use parquet::file::properties::WriterProperties; use tempfile::NamedTempFile; @@ -257,7 +257,10 @@ async fn bad_selection() { .await .unwrap_err(); let err_string = err.to_string(); - assert_contains!(&err_string, "Internal error: Invalid ParquetAccessPlan Selection. Row group 0 has 5 rows but selection only specifies 4 rows"); + assert_contains!( + &err_string, + "Row group 0 has 5 rows but selection only specifies 4 rows." 
+ ); } /// Return a RowSelection of 1 rows from a row group of 5 rows @@ -355,11 +358,11 @@ impl TestFull { let source = if let Some(predicate) = predicate { let df_schema = DFSchema::try_from(schema.clone())?; let predicate = ctx.create_physical_expr(predicate, &df_schema)?; - Arc::new(ParquetSource::default().with_predicate(predicate)) + Arc::new(ParquetSource::new(schema.clone()).with_predicate(predicate)) } else { - Arc::new(ParquetSource::default()) + Arc::new(ParquetSource::new(schema.clone())) }; - let config = FileScanConfigBuilder::new(object_store_url, schema.clone(), source) + let config = FileScanConfigBuilder::new(object_store_url, source) .with_file(partitioned_file) .build(); @@ -406,7 +409,7 @@ fn get_test_data() -> TestData { .expect("tempfile creation"); let props = WriterProperties::builder() - .set_max_row_group_size(row_per_group) + .set_max_row_group_row_count(Some(row_per_group)) .build(); let batches = create_data_batch(scenario); diff --git a/datafusion/core/tests/parquet/file_statistics.rs b/datafusion/core/tests/parquet/file_statistics.rs index 64ee92eda2545..84396be8a6a67 100644 --- a/datafusion/core/tests/parquet/file_statistics.rs +++ b/datafusion/core/tests/parquet/file_statistics.rs @@ -18,31 +18,30 @@ use std::fs; use std::sync::Arc; +use datafusion::datasource::TableProvider; use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::{ ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl, }; use datafusion::datasource::source::DataSourceExec; -use datafusion::datasource::TableProvider; use datafusion::execution::context::SessionState; use datafusion::execution::session_state::SessionStateBuilder; use datafusion::prelude::SessionContext; -use datafusion_common::stats::Precision; use datafusion_common::DFSchema; +use datafusion_common::stats::Precision; +use datafusion_execution::cache::DefaultListFilesCache; use datafusion_execution::cache::cache_manager::CacheManagerConfig; -use datafusion_execution::cache::cache_unit::{ - DefaultFileStatisticsCache, DefaultListFilesCache, -}; +use datafusion_execution::cache::cache_unit::DefaultFileStatisticsCache; use datafusion_execution::config::SessionConfig; use datafusion_execution::runtime_env::RuntimeEnvBuilder; -use datafusion_expr::{col, lit, Expr}; +use datafusion_expr::{Expr, col, lit}; use datafusion::datasource::physical_plan::FileScanConfig; use datafusion_common::config::ConfigOptions; -use datafusion_physical_optimizer::filter_pushdown::FilterPushdown; use datafusion_physical_optimizer::PhysicalOptimizerRule; -use datafusion_physical_plan::filter::FilterExec; +use datafusion_physical_optimizer::filter_pushdown::FilterPushdown; use datafusion_physical_plan::ExecutionPlan; +use datafusion_physical_plan::filter::FilterExec; use tempfile::tempdir; #[tokio::test] @@ -89,7 +88,7 @@ async fn check_stats_precision_with_filter_pushdown() { .unwrap(); assert!( - optimized_exec.as_any().is::<DataSourceExec>(), + optimized_exec.is::<DataSourceExec>(), "Sanity check that the pushdown did what we expected" ); // Scan with filter pushdown, stats are inexact @@ -127,8 +126,9 @@ async fn load_table_stats_with_session_level_cache() { ); assert_eq!( exec1.partition_statistics(None).unwrap().total_byte_size, - // TODO correct byte size: https://github.com/apache/datafusion/issues/14936 - Precision::Exact(671), + // Byte size is absent because we cannot estimate the output size + // of the Arrow data since there are variable length columns.
+ Precision::Absent, ); assert_eq!(get_static_cache_size(&state1), 1); @@ -142,8 +142,8 @@ ); assert_eq!( exec2.partition_statistics(None).unwrap().total_byte_size, - // TODO correct byte size: https://github.com/apache/datafusion/issues/14936 - Precision::Exact(671), + // Absent because the data contains variable length columns + Precision::Absent, ); assert_eq!(get_static_cache_size(&state2), 1); @@ -157,8 +157,8 @@ ); assert_eq!( exec3.partition_statistics(None).unwrap().total_byte_size, - // TODO correct byte size: https://github.com/apache/datafusion/issues/14936 - Precision::Exact(671), + // Absent because the data contains variable length columns + Precision::Absent, ); // List same file no increase assert_eq!(get_static_cache_size(&state1), 1); @@ -196,12 +196,9 @@ async fn list_files_with_session_level_cache() { //Session 1 first time list files assert_eq!(get_list_file_cache_size(&state1), 0); let exec1 = table1.scan(&state1, None, &[], None).await.unwrap(); - let data_source_exec = exec1.as_any().downcast_ref::<DataSourceExec>().unwrap(); + let data_source_exec = exec1.downcast_ref::<DataSourceExec>().unwrap(); let data_source = data_source_exec.data_source(); - let parquet1 = data_source - .as_any() - .downcast_ref::<ParquetSource>() - .unwrap(); + let parquet1 = data_source.downcast_ref::<ParquetSource>().unwrap(); assert_eq!(get_list_file_cache_size(&state1), 1); let fg = &parquet1.file_groups; @@ -212,12 +209,9 @@ //check session 1 cache result not show in session 2 assert_eq!(get_list_file_cache_size(&state2), 0); let exec2 = table2.scan(&state2, None, &[], None).await.unwrap(); - let data_source_exec = exec2.as_any().downcast_ref::<DataSourceExec>().unwrap(); + let data_source_exec = exec2.downcast_ref::<DataSourceExec>().unwrap(); let data_source = data_source_exec.data_source(); - let parquet2 = data_source - .as_any() - .downcast_ref::<ParquetSource>() - .unwrap(); + let parquet2 = data_source.downcast_ref::<ParquetSource>().unwrap(); assert_eq!(get_list_file_cache_size(&state2), 1); let fg2 = &parquet2.file_groups; @@ -228,12 +222,9 @@ //check session 1 cache result not show in session 2 assert_eq!(get_list_file_cache_size(&state1), 1); let exec3 = table1.scan(&state1, None, &[], None).await.unwrap(); - let data_source_exec = exec3.as_any().downcast_ref::<DataSourceExec>().unwrap(); + let data_source_exec = exec3.downcast_ref::<DataSourceExec>().unwrap(); let data_source = data_source_exec.data_source(); - let parquet3 = data_source - .as_any() - .downcast_ref::<ParquetSource>() - .unwrap(); + let parquet3 = data_source.downcast_ref::<ParquetSource>().unwrap(); assert_eq!(get_list_file_cache_size(&state1), 1); let fg = &parquet3.file_groups; diff --git a/datafusion/core/tests/parquet/filter_pushdown.rs b/datafusion/core/tests/parquet/filter_pushdown.rs index 966f251613979..e6266b2c088d7 100644 --- a/datafusion/core/tests/parquet/filter_pushdown.rs +++ b/datafusion/core/tests/parquet/filter_pushdown.rs @@ -31,7 +31,7 @@ use arrow::record_batch::RecordBatch; use datafusion::physical_plan::collect; use datafusion::physical_plan::metrics::{MetricValue, MetricsSet}; use datafusion::prelude::{ - col, lit, lit_timestamp_nano, Expr, ParquetReadOptions, SessionContext, + Expr, ParquetReadOptions, SessionContext, col, lit, lit_timestamp_nano, }; use datafusion::test_util::parquet::{ParquetScanOptions, TestParquetFile}; use datafusion_expr::utils::{conjunction, disjunction, split_conjunction}; @@ -63,7 +63,7 @@ async fn single_file() { // Set the row group
size smaller so can test with fewer rows let props = WriterProperties::builder() - .set_max_row_group_size(1024) + .set_max_row_group_row_count(Some(1024)) .build(); // Only create the parquet file once as it is fairly large @@ -220,7 +220,6 @@ async fn single_file() { } #[tokio::test] -#[allow(dead_code)] async fn single_file_small_data_pages() { let batches = read_parquet_test_data( "tests/data/filter_pushdown/single_file_small_pages.gz.parquet", @@ -231,7 +230,7 @@ // Set a low row count limit to improve page filtering let props = WriterProperties::builder() - .set_max_row_group_size(2048) + .set_max_row_group_row_count(Some(2048)) .set_data_page_row_count_limit(512) .set_write_batch_size(512) .build(); @@ -636,6 +635,43 @@ async fn predicate_cache_pushdown_default() -> datafusion_common::Result<()> { config.options_mut().execution.parquet.pushdown_filters = true; let ctx = SessionContext::new_with_config(config); // The cache is on by default, and used when filter pushdown is enabled + PredicateCacheTest { + expected_inner_records: 8, + expected_records: 7, // reads more rows than necessary from the cache, because another bitmap is applied afterwards + } + .run(&ctx) + .await +} + +#[tokio::test] +async fn predicate_cache_stats_issue_19561() -> datafusion_common::Result<()> { + let mut config = SessionConfig::new(); + config.options_mut().execution.parquet.pushdown_filters = true; + // force multiple batches to trigger the repeated metric compounding bug + config.options_mut().execution.batch_size = 1; + let ctx = SessionContext::new_with_config(config); + // The cache is on by default, and used when filter pushdown is enabled + PredicateCacheTest { + expected_inner_records: 8, + expected_records: 4, + } + .run(&ctx) + .await +} + +#[tokio::test] +async fn predicate_cache_pushdown_default_selections_only() +-> datafusion_common::Result<()> { + let mut config = SessionConfig::new(); + config.options_mut().execution.parquet.pushdown_filters = true; + // forcing filter selections minimizes the number of rows read from the cache + config + .options_mut() + .execution + .parquet + .force_filter_selections = true; + let ctx = SessionContext::new_with_config(config); + // The cache is on by default, and used when filter pushdown is enabled PredicateCacheTest { expected_inner_records: 8, expected_records: 4, diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index 097600e45eadd..e96bd49b9ace9 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -19,20 +19,21 @@ use crate::parquet::utils::MetricsFinder; use arrow::{ array::{ - make_array, Array, ArrayRef, BinaryArray, Date32Array, Date64Array, - Decimal128Array, DictionaryArray, FixedSizeBinaryArray, Float64Array, Int16Array, - Int32Array, Int64Array, Int8Array, LargeBinaryArray, LargeStringArray, - StringArray, TimestampMicrosecondArray, TimestampMillisecondArray, - TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, - UInt64Array, UInt8Array, + Array, ArrayRef, BinaryArray, Date32Array, Date64Array, Decimal128Array, + DictionaryArray, FixedSizeBinaryArray, Float64Array, Int8Array, Int16Array, + Int32Array, Int64Array, LargeBinaryArray, LargeStringArray, StringArray, + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + TimestampSecondArray, UInt8Array, UInt16Array, UInt32Array, UInt64Array, + make_array, }, datatypes::{DataType, Field, Schema}, record_batch::RecordBatch,
util::pretty::pretty_format_batches, }; +use arrow_schema::SchemaRef; use chrono::{Datelike, Duration, TimeDelta}; use datafusion::{ - datasource::{provider_as_source, TableProvider}, + datasource::{TableProvider, provider_as_source}, physical_plan::metrics::MetricsSet, prelude::{ParquetReadOptions, SessionConfig, SessionContext}, }; @@ -43,16 +44,18 @@ use parquet::file::properties::{EnabledStatistics, WriterProperties}; use std::sync::Arc; use tempfile::NamedTempFile; +mod content_defined_chunking; mod custom_reader; #[cfg(feature = "parquet_encryption")] mod encryption; +mod expr_adapter; mod external_access_plan; mod file_statistics; mod filter_pushdown; +mod ordering; mod page_pruning; mod row_group_pruning; mod schema; -mod schema_adapter; mod schema_coercion; mod utils; @@ -109,6 +112,26 @@ struct ContextWithParquet { ctx: SessionContext, } +struct PruningMetric { + total_pruned: usize, + total_matched: usize, + total_fully_matched: usize, +} + +impl PruningMetric { + pub fn total_pruned(&self) -> usize { + self.total_pruned + } + + pub fn total_matched(&self) -> usize { + self.total_matched + } + + pub fn total_fully_matched(&self) -> usize { + self.total_fully_matched + } +} + /// The output of running one of the test cases struct TestOutput { /// The input query SQL @@ -126,8 +149,8 @@ struct TestOutput { impl TestOutput { /// retrieve the value of the named metric, if any fn metric_value(&self, metric_name: &str) -> Option<usize> { - if let Some((pruned, _matched)) = self.pruning_metric(metric_name) { - return Some(pruned); + if let Some(pm) = self.pruning_metric(metric_name) { + return Some(pm.total_pruned()); } self.parquet_metrics @@ -140,27 +163,33 @@ struct TestOutput { }) } - fn pruning_metric(&self, metric_name: &str) -> Option<(usize, usize)> { + fn pruning_metric(&self, metric_name: &str) -> Option<PruningMetric> { let mut total_pruned = 0; let mut total_matched = 0; + let mut total_fully_matched = 0; let mut found = false; for metric in self.parquet_metrics.iter() { let metric = metric.as_ref(); - if metric.value().name() == metric_name { - if let MetricValue::PruningMetrics { + if metric.value().name() == metric_name + && let MetricValue::PruningMetrics { pruning_metrics, ..
} = metric.value() - { - total_pruned += pruning_metrics.pruned(); - total_matched += pruning_metrics.matched(); - found = true; - } + { + total_pruned += pruning_metrics.pruned(); + total_matched += pruning_metrics.matched(); + total_fully_matched += pruning_metrics.fully_matched(); + + found = true; } } if found { - Some((total_pruned, total_matched)) + Some(PruningMetric { + total_pruned, + total_matched, + total_fully_matched, + }) } else { None } @@ -172,27 +201,33 @@ impl TestOutput { } /// The number of row_groups pruned / matched by bloom filter - fn row_groups_bloom_filter(&self) -> Option<(usize, usize)> { + fn row_groups_bloom_filter(&self) -> Option<PruningMetric> { self.pruning_metric("row_groups_pruned_bloom_filter") } /// The number of row_groups matched by statistics fn row_groups_matched_statistics(&self) -> Option<usize> { self.pruning_metric("row_groups_pruned_statistics") - .map(|(_pruned, matched)| matched) + .map(|pm| pm.total_matched()) + } + + /// The number of row_groups fully matched by statistics + fn row_groups_fully_matched_statistics(&self) -> Option<usize> { + self.pruning_metric("row_groups_pruned_statistics") + .map(|pm| pm.total_fully_matched()) } /// The number of row_groups pruned by statistics fn row_groups_pruned_statistics(&self) -> Option<usize> { self.pruning_metric("row_groups_pruned_statistics") - .map(|(pruned, _matched)| pruned) + .map(|pm| pm.total_pruned()) } /// Metric `files_ranges_pruned_statistics` tracks both pruned and matched count, /// for testing purpose, here it only aggregate the `pruned` count. fn files_ranges_pruned_statistics(&self) -> Option<usize> { self.pruning_metric("files_ranges_pruned_statistics") - .map(|(pruned, _matched)| pruned) + .map(|pm| pm.total_pruned()) } /// The number of row_groups matched by bloom filter or statistics /// /// filter: 7 total -> 3 matched, this function returns 3 for the final matched /// count.
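+ /// For example (hypothetical counts): with 10 row groups, statistics might + /// match 7 of 10 (pruning 3) and the bloom filter then match 3 of those 7 + /// (pruning 4); this returns Some(3), while `row_groups_pruned` below sums + /// the two pruned counts to Some(7).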
fn row_groups_matched(&self) -> Option<usize> { - self.row_groups_bloom_filter() - .map(|(_pruned, matched)| matched) + self.row_groups_bloom_filter().map(|pm| pm.total_matched()) } /// The number of row_groups pruned fn row_groups_pruned(&self) -> Option<usize> { self.row_groups_bloom_filter() - .map(|(pruned, _matched)| pruned) + .map(|pm| pm.total_pruned()) .zip(self.row_groups_pruned_statistics()) .map(|(a, b)| a + b) } @@ -216,7 +250,13 @@ impl TestOutput { /// The number of row pages pruned fn row_pages_pruned(&self) -> Option<usize> { self.pruning_metric("page_index_rows_pruned") - .map(|(pruned, _matched)| pruned) + .map(|pm| pm.total_pruned()) + } + + /// The number of row groups pruned by limit pruning + fn limit_pruned_row_groups(&self) -> Option<usize> { + self.pruning_metric("limit_pruned_row_groups") + .map(|pm| pm.total_pruned()) } fn description(&self) -> String { @@ -232,20 +272,41 @@ impl TestOutput { /// and the appropriate scenario impl ContextWithParquet { async fn new(scenario: Scenario, unit: Unit) -> Self { - Self::with_config(scenario, unit, SessionConfig::new()).await + Self::with_config(scenario, unit, SessionConfig::new(), None, None).await + } + + /// Set custom schema and batches for the test + pub async fn with_custom_data( + scenario: Scenario, + unit: Unit, + schema: Arc<Schema>, + batches: Vec<RecordBatch>, + ) -> Self { + Self::with_config( + scenario, + unit, + SessionConfig::new(), + Some(schema), + Some(batches), + ) + .await } async fn with_config( scenario: Scenario, unit: Unit, mut config: SessionConfig, + custom_schema: Option<SchemaRef>, + custom_batches: Option<Vec<RecordBatch>>, ) -> Self { // Use a single partition for deterministic results no matter how many CPUs the host has config = config.with_target_partitions(1); let file = match unit { Unit::RowGroup(row_per_group) => { config = config.with_parquet_bloom_filter_pruning(true); - make_test_file_rg(scenario, row_per_group).await + config.options_mut().execution.parquet.pushdown_filters = true; + make_test_file_rg(scenario, row_per_group, custom_schema, custom_batches) + .await } Unit::Page(row_per_page) => { config = config.with_parquet_page_index_pruning(true); @@ -516,9 +577,9 @@ fn make_uint_batches(start: u8, end: u8) -> RecordBatch { Field::new("u64", DataType::UInt64, true), ])); let v8: Vec<u8> = (start..end).collect(); - let v16: Vec<u16> = (start as _..end as _).collect(); - let v32: Vec<u32> = (start as _..end as _).collect(); - let v64: Vec<u64> = (start as _..end as _).collect(); + let v16: Vec<u16> = (start as u16..end as u16).collect(); + let v32: Vec<u32> = (start as u32..end as u32).collect(); + let v64: Vec<u64> = (start as u64..end as u64).collect(); RecordBatch::try_new( schema, vec![ @@ -652,6 +713,7 @@ fn make_date_batch(offset: Duration) -> RecordBatch { /// of the column. It is *not* a table named service.name /// /// name | service.name +#[expect(clippy::needless_pass_by_value)] fn make_bytearray_batch( name: &str, string_values: Vec<&str>, @@ -707,6 +769,7 @@ /// of the column.
It is *not* a table named service.name /// /// name | service.name +#[expect(clippy::needless_pass_by_value)] fn make_names_batch(name: &str, service_name_values: Vec<&str>) -> RecordBatch { let num_rows = service_name_values.len(); let name: StringArray = std::iter::repeat_n(Some(name), num_rows).collect(); @@ -791,6 +854,7 @@ fn make_utf8_batch(value: Vec<Option<&str>>) -> RecordBatch { .unwrap() } +#[expect(clippy::needless_pass_by_value)] fn make_dictionary_batch(strings: Vec<&str>, integers: Vec<i32>) -> RecordBatch { let keys = Int32Array::from_iter(0..strings.len() as i32); let small_keys = Int16Array::from_iter(0..strings.len() as i16); @@ -839,6 +903,7 @@ fn make_dictionary_batch(strings: Vec<&str>, integers: Vec<i32>) -> RecordBatch .unwrap() } +#[expect(clippy::needless_pass_by_value)] fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> { match scenario { Scenario::Timestamps => { @@ -1071,7 +1136,12 @@ fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> { } /// Create a test parquet file with various data types -async fn make_test_file_rg(scenario: Scenario, row_per_group: usize) -> NamedTempFile { +async fn make_test_file_rg( + scenario: Scenario, + row_per_group: usize, + custom_schema: Option<SchemaRef>, + custom_batches: Option<Vec<RecordBatch>>, +) -> NamedTempFile { let mut output_file = tempfile::Builder::new() .prefix("parquet_pruning") .suffix(".parquet") @@ -1079,13 +1149,19 @@ async fn make_test_file_rg(scenario: Scenario, row_per_group: usize) -> NamedTem .expect("tempfile creation"); let props = WriterProperties::builder() - .set_max_row_group_size(row_per_group) + .set_max_row_group_row_count(Some(row_per_group)) .set_bloom_filter_enabled(true) .set_statistics_enabled(EnabledStatistics::Page) .build(); - let batches = create_data_batch(scenario); - let schema = batches[0].schema(); + let (batches, schema) = + if let (Some(schema), Some(batches)) = (custom_schema, custom_batches) { + (batches, schema) + } else { + let batches = create_data_batch(scenario); + let schema = batches[0].schema(); + (batches, schema) + }; let mut writer = ArrowWriter::try_new(&mut output_file, schema, Some(props)).unwrap(); diff --git a/datafusion/core/tests/parquet/ordering.rs b/datafusion/core/tests/parquet/ordering.rs new file mode 100644 index 0000000000000..faecb4ca6a861 --- /dev/null +++ b/datafusion/core/tests/parquet/ordering.rs @@ -0,0 +1,103 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tests for ordering in Parquet sorting_columns metadata + +use datafusion::prelude::SessionContext; +use datafusion_common::Result; +use tempfile::tempdir; + +/// Test that CREATE TABLE ...
WITH ORDER writes sorting_columns to Parquet metadata +#[tokio::test] +async fn test_create_table_with_order_writes_sorting_columns() -> Result<()> { + use parquet::file::reader::FileReader; + use parquet::file::serialized_reader::SerializedFileReader; + use std::fs::File; + + let ctx = SessionContext::new(); + let tmp_dir = tempdir()?; + let table_path = tmp_dir.path().join("sorted_table"); + std::fs::create_dir_all(&table_path)?; + + // Create external table with ordering + let create_table_sql = format!( + "CREATE EXTERNAL TABLE sorted_data (a INT, b VARCHAR) \ + STORED AS PARQUET \ + LOCATION '{}' \ + WITH ORDER (a ASC NULLS FIRST, b DESC NULLS LAST)", + table_path.display() + ); + ctx.sql(&create_table_sql).await?; + + // Insert sorted data + ctx.sql("INSERT INTO sorted_data VALUES (1, 'x'), (2, 'y'), (3, 'z')") + .await? + .collect() + .await?; + + // Find the parquet file that was written + let parquet_files: Vec<_> = std::fs::read_dir(&table_path)? + .filter_map(|e| e.ok()) + .filter(|e| e.path().extension().is_some_and(|ext| ext == "parquet")) + .collect(); + + assert!( + !parquet_files.is_empty(), + "Expected at least one parquet file in {}", + table_path.display() + ); + + // Read the parquet file and verify sorting_columns metadata + let file = File::open(parquet_files[0].path())?; + let reader = SerializedFileReader::new(file)?; + let metadata = reader.metadata(); + + // Check that row group has sorting_columns + let row_group = metadata.row_group(0); + let sorting_columns = row_group.sorting_columns(); + + assert!( + sorting_columns.is_some(), + "Expected sorting_columns in row group metadata" + ); + let sorting = sorting_columns.unwrap(); + assert_eq!(sorting.len(), 2, "Expected 2 sorting columns"); + + // First column: a ASC NULLS FIRST (column_idx = 0) + assert_eq!(sorting[0].column_idx, 0, "First sort column should be 'a'"); + assert!( + !sorting[0].descending, + "First column should be ASC (descending=false)" + ); + assert!( + sorting[0].nulls_first, + "First column should have NULLS FIRST" + ); + + // Second column: b DESC NULLS LAST (column_idx = 1) + assert_eq!(sorting[1].column_idx, 1, "Second sort column should be 'b'"); + assert!( + sorting[1].descending, + "Second column should be DESC (descending=true)" + ); + assert!( + !sorting[1].nulls_first, + "Second column should have NULLS LAST" + ); + + Ok(()) +} diff --git a/datafusion/core/tests/parquet/page_pruning.rs b/datafusion/core/tests/parquet/page_pruning.rs index 27bee10234b57..a41803191ad05 100644 --- a/datafusion/core/tests/parquet/page_pruning.rs +++ b/datafusion/core/tests/parquet/page_pruning.rs @@ -20,26 +20,29 @@ use std::sync::Arc; use crate::parquet::Unit::Page; use crate::parquet::{ContextWithParquet, Scenario}; -use arrow::array::RecordBatch; -use datafusion::datasource::file_format::parquet::ParquetFormat; +use arrow::array::{Int32Array, RecordBatch}; +use arrow::datatypes::{DataType, Field, Schema}; use datafusion::datasource::file_format::FileFormat; +use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::object_store::ObjectStoreUrl; use datafusion::datasource::physical_plan::ParquetSource; use datafusion::datasource::source::DataSourceExec; use datafusion::execution::context::SessionState; -use datafusion::physical_plan::metrics::MetricValue; use datafusion::physical_plan::ExecutionPlan; -use datafusion::prelude::SessionContext; +use datafusion::physical_plan::metrics::MetricValue; +use 
datafusion::prelude::{SessionConfig, SessionContext}; use datafusion_common::{ScalarValue, ToDFSchema}; use datafusion_expr::execution_props::ExecutionProps; -use datafusion_expr::{col, lit, Expr}; +use datafusion_expr::{Expr, col, lit}; use datafusion_physical_expr::create_physical_expr; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use futures::StreamExt; -use object_store::path::Path; use object_store::ObjectMeta; +use object_store::path::Path; +use parquet::arrow::ArrowWriter; +use parquet::file::properties::WriterProperties; async fn get_parquet_exec( state: &SessionState, @@ -67,26 +70,19 @@ .await .unwrap(); - let partitioned_file = PartitionedFile { - object_meta: meta, - partition_values: vec![], - range: None, - statistics: None, - extensions: None, - metadata_size_hint: None, - }; + let partitioned_file = PartitionedFile::new_from_meta(meta); let df_schema = schema.clone().to_dfschema().unwrap(); let execution_props = ExecutionProps::new(); let predicate = create_physical_expr(&filter, &df_schema, &execution_props).unwrap(); let source = Arc::new( - ParquetSource::default() + ParquetSource::new(schema.clone()) .with_predicate(predicate) .with_enable_page_index(true) .with_pushdown_filters(pushdown_filters), ); - let base_config = FileScanConfigBuilder::new(object_store_url, schema, source) + let base_config = FileScanConfigBuilder::new(object_store_url, source) .with_file(partitioned_file) .build(); @@ -370,281 +366,367 @@ async fn prune_date64() { } macro_rules! int_tests { - ($bits:expr) => { - paste::item! { - #[tokio::test] - // null count min max - // page-0 0 -5 -1 - // page-1 0 -4 0 - // page-2 0 0 4 - // page-3 0 5 9 - async fn [<prune_int $bits _lt>]() { - test_prune( - Scenario::Int, - &format!("SELECT * FROM t where i{} < 1", $bits), - Some(0), - Some(5), - 11, - 5, - ) - .await; - // result of sql "SELECT * FROM t where i < 1" is same as - // "SELECT * FROM t where -i > -1" - test_prune( - Scenario::Int, - &format!("SELECT * FROM t where -i{} > -1", $bits), - Some(0), - Some(5), - 11, - 5, - ) - .await; - } - - #[tokio::test] - async fn [<prune_int $bits _gt>]() { - test_prune( - Scenario::Int, - &format!("SELECT * FROM t where i{} > 8", $bits), - Some(0), - Some(15), - 1, - 5, - ) - .await; - - test_prune( - Scenario::Int, - &format!("SELECT * FROM t where -i{} < -8", $bits), - Some(0), - Some(15), - 1, - 5, - ) - .await; - } - - #[tokio::test] - async fn [<prune_int $bits _eq>]() { - test_prune( - Scenario::Int, - &format!("SELECT * FROM t where i{} = 1", $bits), - Some(0), - Some(15), - 1, - 5 - ) - .await; - } - #[tokio::test] - async fn [<prune_int $bits _scalar_fun_and_eq>]() { - test_prune( - Scenario::Int, - &format!("SELECT * FROM t where abs(i{}) = 1 and i{} = 1", $bits, $bits), - Some(0), - Some(15), - 1, - 5 - ) - .await; - } - - #[tokio::test] - async fn [<prune_int $bits _scalar_fun>]() { - test_prune( - Scenario::Int, - &format!("SELECT * FROM t where abs(i{}) = 1", $bits), - Some(0), - Some(0), - 3, - 5 - ) - .await; - } - - #[tokio::test] - async fn [<prune_int $bits _complex_expr>]() { - test_prune( - Scenario::Int, - &format!("SELECT * FROM t where i{}+1 = 1", $bits), - Some(0), - Some(0), - 2, - 5 - ) - .await; - } - - #[tokio::test] - async fn [<prune_int $bits _complex_expr_subtract>]() { - test_prune( - Scenario::Int, - &format!("SELECT * FROM t where 1-i{} > 1", $bits), - Some(0), - Some(0), - 9, - 5 - ) - .await; - } - - #[tokio::test] - async fn [<prune_int $bits _eq_in_list>]() { - // result of sql "SELECT * FROM t where in (1)" - test_prune( - Scenario::Int, - &format!("SELECT * FROM t where i{} in (1)", $bits), - Some(0), - Some(15), - 1, - 5 - ) - .await; - } - - #[tokio::test] - async fn [<prune_int $bits _eq_in_list_negated>]() { - // result of
sql "SELECT * FROM t where not in (1)" prune nothing - test_prune( - Scenario::Int, - &format!("SELECT * FROM t where i{} not in (1)", $bits), - Some(0), - Some(0), - 19, - 5 - ) - .await; - } + ($bits:expr, $fn_lt:ident, $fn_gt:ident, $fn_eq:ident, $fn_scalar_fun_and_eq:ident, $fn_scalar_fun:ident, $fn_complex_expr:ident, $fn_complex_expr_subtract:ident, $fn_eq_in_list:ident, $fn_eq_in_list_negated:ident) => { + #[tokio::test] + // null count min max + // page-0 0 -5 -1 + // page-1 0 -4 0 + // page-2 0 0 4 + // page-3 0 5 9 + async fn $fn_lt() { + test_prune( + Scenario::Int, + &format!("SELECT * FROM t where i{} < 1", $bits), + Some(0), + Some(5), + 11, + 5, + ) + .await; + // result of sql "SELECT * FROM t where i < 1" is same as + // "SELECT * FROM t where -i > -1" + test_prune( + Scenario::Int, + &format!("SELECT * FROM t where -i{} > -1", $bits), + Some(0), + Some(5), + 11, + 5, + ) + .await; } - } + + #[tokio::test] + async fn $fn_gt() { + test_prune( + Scenario::Int, + &format!("SELECT * FROM t where i{} > 8", $bits), + Some(0), + Some(15), + 1, + 5, + ) + .await; + + test_prune( + Scenario::Int, + &format!("SELECT * FROM t where -i{} < -8", $bits), + Some(0), + Some(15), + 1, + 5, + ) + .await; + } + + #[tokio::test] + async fn $fn_eq() { + test_prune( + Scenario::Int, + &format!("SELECT * FROM t where i{} = 1", $bits), + Some(0), + Some(15), + 1, + 5, + ) + .await; + } + #[tokio::test] + async fn $fn_scalar_fun_and_eq() { + test_prune( + Scenario::Int, + &format!( + "SELECT * FROM t where abs(i{}) = 1 and i{} = 1", + $bits, $bits + ), + Some(0), + Some(15), + 1, + 5, + ) + .await; + } + + #[tokio::test] + async fn $fn_scalar_fun() { + test_prune( + Scenario::Int, + &format!("SELECT * FROM t where abs(i{}) = 1", $bits), + Some(0), + Some(0), + 3, + 5, + ) + .await; + } + + #[tokio::test] + async fn $fn_complex_expr() { + test_prune( + Scenario::Int, + &format!("SELECT * FROM t where i{}+1 = 1", $bits), + Some(0), + Some(0), + 2, + 5, + ) + .await; + } + + #[tokio::test] + async fn $fn_complex_expr_subtract() { + test_prune( + Scenario::Int, + &format!("SELECT * FROM t where 1-i{} > 1", $bits), + Some(0), + Some(0), + 9, + 5, + ) + .await; + } + + #[tokio::test] + async fn $fn_eq_in_list() { + // result of sql "SELECT * FROM t where in (1)" + test_prune( + Scenario::Int, + &format!("SELECT * FROM t where i{} in (1)", $bits), + Some(0), + Some(15), + 1, + 5, + ) + .await; + } + + #[tokio::test] + async fn $fn_eq_in_list_negated() { + // result of sql "SELECT * FROM t where not in (1)" prune nothing + test_prune( + Scenario::Int, + &format!("SELECT * FROM t where i{} not in (1)", $bits), + Some(0), + Some(0), + 19, + 5, + ) + .await; + } + }; } -int_tests!(8); -int_tests!(16); -int_tests!(32); -int_tests!(64); +int_tests!( + 8, + prune_int8_lt, + prune_int8_gt, + prune_int8_eq, + prune_int8_scalar_fun_and_eq, + prune_int8_scalar_fun, + prune_int8_complex_expr, + prune_int8_complex_expr_subtract, + prune_int8_eq_in_list, + prune_int8_eq_in_list_negated +); +int_tests!( + 16, + prune_int16_lt, + prune_int16_gt, + prune_int16_eq, + prune_int16_scalar_fun_and_eq, + prune_int16_scalar_fun, + prune_int16_complex_expr, + prune_int16_complex_expr_subtract, + prune_int16_eq_in_list, + prune_int16_eq_in_list_negated +); +int_tests!( + 32, + prune_int32_lt, + prune_int32_gt, + prune_int32_eq, + prune_int32_scalar_fun_and_eq, + prune_int32_scalar_fun, + prune_int32_complex_expr, + prune_int32_complex_expr_subtract, + prune_int32_eq_in_list, + prune_int32_eq_in_list_negated +); +int_tests!( + 64, 
+ prune_int64_lt, + prune_int64_gt, + prune_int64_eq, + prune_int64_scalar_fun_and_eq, + prune_int64_scalar_fun, + prune_int64_complex_expr, + prune_int64_complex_expr_subtract, + prune_int64_eq_in_list, + prune_int64_eq_in_list_negated +); macro_rules! uint_tests { - ($bits:expr) => { - paste::item! { - #[tokio::test] - // null count min max - // page-0 0 0 4 - // page-1 0 1 5 - // page-2 0 5 9 - // page-3 0 250 254 - async fn []() { - test_prune( - Scenario::UInt, - &format!("SELECT * FROM t where u{} < 6", $bits), - Some(0), - Some(5), - 11, - 5 - ) - .await; - } - - #[tokio::test] - async fn []() { - test_prune( - Scenario::UInt, - &format!("SELECT * FROM t where u{} > 253", $bits), - Some(0), - Some(15), - 1, - 5 - ) - .await; - } - - #[tokio::test] - async fn []() { - test_prune( - Scenario::UInt, - &format!("SELECT * FROM t where u{} = 6", $bits), - Some(0), - Some(15), - 1, - 5 - ) - .await; - } - - #[tokio::test] - async fn []() { - test_prune( - Scenario::UInt, - &format!("SELECT * FROM t where power(u{}, 2) = 36 and u{} = 6", $bits, $bits), - Some(0), - Some(15), - 1, - 5 - ) - .await; - } - - #[tokio::test] - async fn []() { - test_prune( - Scenario::UInt, - &format!("SELECT * FROM t where power(u{}, 2) = 25", $bits), - Some(0), - Some(0), - 2, - 5 - ) - .await; - } - - #[tokio::test] - async fn []() { - test_prune( - Scenario::UInt, - &format!("SELECT * FROM t where u{}+1 = 6", $bits), - Some(0), - Some(0), - 2, - 5 - ) - .await; - } - - #[tokio::test] - async fn []() { - // result of sql "SELECT * FROM t where in (1)" - test_prune( - Scenario::UInt, - &format!("SELECT * FROM t where u{} in (6)", $bits), - Some(0), - Some(15), - 1, - 5 - ) - .await; - } - - #[tokio::test] - async fn []() { - // result of sql "SELECT * FROM t where not in (6)" prune nothing - test_prune( - Scenario::UInt, - &format!("SELECT * FROM t where u{} not in (6)", $bits), - Some(0), - Some(0), - 19, - 5 - ) - .await; - } + ($bits:expr, $fn_lt:ident, $fn_gt:ident, $fn_eq:ident, $fn_scalar_fun_and_eq:ident, $fn_scalar_fun:ident, $fn_complex_expr:ident, $fn_eq_in_list:ident, $fn_eq_in_list_negated:ident) => { + #[tokio::test] + // null count min max + // page-0 0 0 4 + // page-1 0 1 5 + // page-2 0 5 9 + // page-3 0 250 254 + async fn $fn_lt() { + test_prune( + Scenario::UInt, + &format!("SELECT * FROM t where u{} < 6", $bits), + Some(0), + Some(5), + 11, + 5, + ) + .await; } - } + + #[tokio::test] + async fn $fn_gt() { + test_prune( + Scenario::UInt, + &format!("SELECT * FROM t where u{} > 253", $bits), + Some(0), + Some(15), + 1, + 5, + ) + .await; + } + + #[tokio::test] + async fn $fn_eq() { + test_prune( + Scenario::UInt, + &format!("SELECT * FROM t where u{} = 6", $bits), + Some(0), + Some(15), + 1, + 5, + ) + .await; + } + + #[tokio::test] + async fn $fn_scalar_fun_and_eq() { + test_prune( + Scenario::UInt, + &format!( + "SELECT * FROM t where power(u{}, 2) = 36 and u{} = 6", + $bits, $bits + ), + Some(0), + Some(15), + 1, + 5, + ) + .await; + } + + #[tokio::test] + async fn $fn_scalar_fun() { + test_prune( + Scenario::UInt, + &format!("SELECT * FROM t where power(u{}, 2) = 25", $bits), + Some(0), + Some(0), + 2, + 5, + ) + .await; + } + + #[tokio::test] + async fn $fn_complex_expr() { + test_prune( + Scenario::UInt, + &format!("SELECT * FROM t where u{}+1 = 6", $bits), + Some(0), + Some(0), + 2, + 5, + ) + .await; + } + + #[tokio::test] + async fn $fn_eq_in_list() { + // result of sql "SELECT * FROM t where in (1)" + test_prune( + Scenario::UInt, + &format!("SELECT * FROM t where u{} in (6)", 
$bits), + Some(0), + Some(15), + 1, + 5, + ) + .await; + } + + #[tokio::test] + async fn $fn_eq_in_list_negated() { + // result of sql "SELECT * FROM t where not in (6)" prune nothing + test_prune( + Scenario::UInt, + &format!("SELECT * FROM t where u{} not in (6)", $bits), + Some(0), + Some(0), + 19, + 5, + ) + .await; + } + }; } -uint_tests!(8); -uint_tests!(16); -uint_tests!(32); -uint_tests!(64); +uint_tests!( + 8, + prune_uint8_lt, + prune_uint8_gt, + prune_uint8_eq, + prune_uint8_scalar_fun_and_eq, + prune_uint8_scalar_fun, + prune_uint8_complex_expr, + prune_uint8_eq_in_list, + prune_uint8_eq_in_list_negated +); +uint_tests!( + 16, + prune_uint16_lt, + prune_uint16_gt, + prune_uint16_eq, + prune_uint16_scalar_fun_and_eq, + prune_uint16_scalar_fun, + prune_uint16_complex_expr, + prune_uint16_eq_in_list, + prune_uint16_eq_in_list_negated +); +uint_tests!( + 32, + prune_uint32_lt, + prune_uint32_gt, + prune_uint32_eq, + prune_uint32_scalar_fun_and_eq, + prune_uint32_scalar_fun, + prune_uint32_complex_expr, + prune_uint32_eq_in_list, + prune_uint32_eq_in_list_negated +); +uint_tests!( + 64, + prune_uint64_lt, + prune_uint64_gt, + prune_uint64_eq, + prune_uint64_scalar_fun_and_eq, + prune_uint64_scalar_fun, + prune_uint64_complex_expr, + prune_uint64_eq_in_list, + prune_uint64_eq_in_list_negated +); #[tokio::test] // null count min max @@ -968,3 +1050,56 @@ fn cast_count_metric(metric: MetricValue) -> Option { _ => None, } } + +#[tokio::test] +async fn test_parquet_opener_without_page_index() { + // Defines a simple schema and batch + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + + // Create a temp file + let file = tempfile::Builder::new() + .suffix(".parquet") + .tempfile() + .unwrap(); + let path = file.path().to_str().unwrap().to_string(); + + // Write parquet WITHOUT page index + // The default WriterProperties does not write page index, but we set it explicitly + // to be robust against future changes in defaults as requested by reviewers. + let props = WriterProperties::builder() + .set_statistics_enabled(parquet::file::properties::EnabledStatistics::None) + .build(); + + let file_fs = std::fs::File::create(&path).unwrap(); + let mut writer = ArrowWriter::try_new(file_fs, batch.schema(), Some(props)).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + // Setup SessionContext with PageIndex enabled + // This triggers the ParquetOpener to try and load page index if available + let config = SessionConfig::new().with_parquet_page_index_pruning(true); + + let ctx = SessionContext::new_with_config(config); + + // Register the table + ctx.register_parquet("t", &path, Default::default()) + .await + .unwrap(); + + // Query the table + // If the bug exists, this might fail because Opener tries to load PageIndex forcefully + let df = ctx.sql("SELECT * FROM t").await.unwrap(); + let batches = df + .collect() + .await + .expect("Failed to read parquet file without page index"); + + // We expect this to succeed, but currently it might fail + assert_eq!(batches.len(), 1); + assert_eq!(batches[0].num_rows(), 3); +} diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs index 0411298055f26..3ec3541af977a 100644 --- a/datafusion/core/tests/parquet/row_group_pruning.rs +++ b/datafusion/core/tests/parquet/row_group_pruning.rs @@ -18,8 +18,12 @@ //! 
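One detail worth pinning down from test_parquet_opener_without_page_index above: whether a page index exists in a file is a function of the writer's statistics level. A sketch of the three levels in parquet::file::properties (the test uses None to guarantee no page index is written):

use parquet::file::properties::{EnabledStatistics, WriterProperties};

// EnabledStatistics::None  -> no statistics at all (what the test writes)
// EnabledStatistics::Chunk -> row-group (column chunk) statistics only
// EnabledStatistics::Page  -> chunk statistics plus the page index
fn props_without_page_index() -> WriterProperties {
    WriterProperties::builder()
        .set_statistics_enabled(EnabledStatistics::None)
        .build()
}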
This file contains an end to end test of parquet pruning. It writes //! data into a parquet file and then verifies row groups are pruned as //! expected. +use std::sync::Arc; + +use arrow::array::{ArrayRef, Int32Array, RecordBatch}; +use arrow_schema::{DataType, Field, Schema}; use datafusion::prelude::SessionConfig; -use datafusion_common::ScalarValue; +use datafusion_common::{DataFusionError, ScalarValue}; use itertools::Itertools; use crate::parquet::Unit::RowGroup; @@ -30,10 +34,12 @@ struct RowGroupPruningTest { query: String, expected_errors: Option, expected_row_group_matched_by_statistics: Option, + expected_row_group_fully_matched_by_statistics: Option, expected_row_group_pruned_by_statistics: Option, expected_files_pruned_by_statistics: Option, expected_row_group_matched_by_bloom_filter: Option, expected_row_group_pruned_by_bloom_filter: Option, + expected_limit_pruned_row_groups: Option, expected_rows: usize, } impl RowGroupPruningTest { @@ -45,9 +51,11 @@ impl RowGroupPruningTest { expected_errors: None, expected_row_group_matched_by_statistics: None, expected_row_group_pruned_by_statistics: None, + expected_row_group_fully_matched_by_statistics: None, expected_files_pruned_by_statistics: None, expected_row_group_matched_by_bloom_filter: None, expected_row_group_pruned_by_bloom_filter: None, + expected_limit_pruned_row_groups: None, expected_rows: 0, } } @@ -76,6 +84,15 @@ impl RowGroupPruningTest { self } + // Set the expected fully matched row groups by statistics + fn with_fully_matched_by_stats( + mut self, + fully_matched_by_stats: Option, + ) -> Self { + self.expected_row_group_fully_matched_by_statistics = fully_matched_by_stats; + self + } + // Set the expected pruned row groups by statistics fn with_pruned_by_stats(mut self, pruned_by_stats: Option) -> Self { self.expected_row_group_pruned_by_statistics = pruned_by_stats; @@ -99,6 +116,11 @@ impl RowGroupPruningTest { self } + fn with_limit_pruned_row_groups(mut self, pruned_by_limit: Option) -> Self { + self.expected_limit_pruned_row_groups = pruned_by_limit; + self + } + /// Set the number of expected rows from the output of this test fn with_expected_rows(mut self, rows: usize) -> Self { self.expected_rows = rows; @@ -135,15 +157,74 @@ impl RowGroupPruningTest { ); let bloom_filter_metrics = output.row_groups_bloom_filter(); assert_eq!( - bloom_filter_metrics.map(|(_pruned, matched)| matched), + bloom_filter_metrics.as_ref().map(|pm| pm.total_matched()), self.expected_row_group_matched_by_bloom_filter, "mismatched row_groups_matched_bloom_filter", ); assert_eq!( - bloom_filter_metrics.map(|(pruned, _matched)| pruned), + bloom_filter_metrics.map(|pm| pm.total_pruned()), self.expected_row_group_pruned_by_bloom_filter, "mismatched row_groups_pruned_bloom_filter", ); + + assert_eq!( + output.result_rows, + self.expected_rows, + "Expected {} rows, got {}: {}", + output.result_rows, + self.expected_rows, + output.description(), + ); + } + + // Execute the test with the current configuration + async fn test_row_group_prune_with_custom_data( + self, + schema: Arc, + batches: Vec, + max_row_per_group: usize, + ) { + let output = ContextWithParquet::with_custom_data( + self.scenario, + RowGroup(max_row_per_group), + schema, + batches, + ) + .await + .query(&self.query) + .await; + + println!("{}", output.description()); + assert_eq!( + output.predicate_evaluation_errors(), + self.expected_errors, + "mismatched predicate_evaluation error" + ); + assert_eq!( + output.row_groups_matched_statistics(), + 
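Terminology for the two new counters above: "matched by statistics" means a row group may contain qualifying rows, while "fully matched" means the min/max statistics prove every row qualifies, so its row count can service a LIMIT without scanning. The distinction for an equality predicate, as an illustrative sketch (not the engine's pruning code):

/// Illustrative only: three-way outcome of min/max stats for `col = v`.
enum StatsMatch {
    Pruned,  // v outside [min, max]: no row can match
    Partial, // v inside the range: must scan to know
    Full,    // min == max == v: every row matches
}

fn classify_eq(min: i32, max: i32, v: i32) -> StatsMatch {
    if v < min || v > max {
        StatsMatch::Pruned
    } else if min == v && max == v {
        StatsMatch::Full
    } else {
        StatsMatch::Partial
    }
}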
self.expected_row_group_matched_by_statistics, + "mismatched row_groups_matched_statistics", + ); + assert_eq!( + output.row_groups_fully_matched_statistics(), + self.expected_row_group_fully_matched_by_statistics, + "mismatched row_groups_fully_matched_statistics", + ); + assert_eq!( + output.row_groups_pruned_statistics(), + self.expected_row_group_pruned_by_statistics, + "mismatched row_groups_pruned_statistics", + ); + assert_eq!( + output.files_ranges_pruned_statistics(), + self.expected_files_pruned_by_statistics, + "mismatched files_ranges_pruned_statistics", + ); + assert_eq!( + output.limit_pruned_row_groups(), + self.expected_limit_pruned_row_groups, + "mismatched limit_pruned_row_groups", + ); assert_eq!( output.result_rows, self.expected_rows, @@ -289,11 +370,16 @@ async fn prune_disabled() { let expected_rows = 10; let config = SessionConfig::new().with_parquet_pruning(false); - let output = - ContextWithParquet::with_config(Scenario::Timestamps, RowGroup(5), config) - .await - .query(query) - .await; + let output = ContextWithParquet::with_config( + Scenario::Timestamps, + RowGroup(5), + config, + None, + None, + ) + .await + .query(query) + .await; println!("{}", output.description()); // This should not prune any @@ -313,321 +399,365 @@ async fn prune_disabled() { // https://github.com/apache/datafusion/issues/9779 bug so that tests pass // if and only if Bloom filters on Int8 and Int16 columns are still buggy. macro_rules! int_tests { - ($bits:expr) => { - paste::item! { - #[tokio::test] - async fn []() { - RowGroupPruningTest::new() - .with_scenario(Scenario::Int) - .with_query(&format!("SELECT * FROM t where i{} < 1", $bits)) - .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(3)) - .with_pruned_by_stats(Some(1)) - .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(3)) - .with_pruned_by_bloom_filter(Some(0)) - .with_expected_rows(11) - .test_row_group_prune() - .await; - - // result of sql "SELECT * FROM t where i < 1" is same as - // "SELECT * FROM t where -i > -1" - RowGroupPruningTest::new() - .with_scenario(Scenario::Int) - .with_query(&format!("SELECT * FROM t where -i{} > -1", $bits)) - .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(3)) - .with_pruned_by_stats(Some(1)) - .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(3)) - .with_pruned_by_bloom_filter(Some(0)) - .with_expected_rows(11) - .test_row_group_prune() - .await; - } - - #[tokio::test] - async fn []() { - RowGroupPruningTest::new() - .with_scenario(Scenario::Int) - .with_query(&format!("SELECT * FROM t where i{} = 1", $bits)) - .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(1)) - .with_pruned_by_stats(Some(3)) - .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(1)) - .with_pruned_by_bloom_filter(Some(0)) - .with_expected_rows(1) - .test_row_group_prune() - .await; - } - #[tokio::test] - async fn []() { - RowGroupPruningTest::new() - .with_scenario(Scenario::Int) - .with_query(&format!("SELECT * FROM t where abs(i{}) = 1 and i{} = 1", $bits, $bits)) - .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(1)) - .with_pruned_by_stats(Some(3)) - .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(1)) - .with_pruned_by_bloom_filter(Some(0)) - .with_expected_rows(1) - .test_row_group_prune() - .await; - } - - #[tokio::test] - async fn []() { - RowGroupPruningTest::new() - .with_scenario(Scenario::Int) - .with_query(&format!("SELECT * FROM t where abs(i{}) = 1", $bits)) - .with_expected_errors(Some(0)) - 
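Also visible in the assertion hunk above: the bloom-filter metrics accessor changed shape, from a (pruned, matched) tuple to a value consumed via total_matched() / total_pruned(), which makes call sites self-describing and leaves room for more counters. The tuple-to-struct move in miniature (hypothetical PruneMetrics; the real type is whatever row_groups_bloom_filter() now returns):

// Hypothetical miniature of the tuple-to-struct migration above.
struct PruneMetrics {
    matched: usize,
    pruned: usize,
}

impl PruneMetrics {
    fn total_matched(&self) -> usize { self.matched }
    fn total_pruned(&self) -> usize { self.pruned }
}

fn report(pm: &PruneMetrics) -> String {
    // before: let (pruned, matched) = metrics; -- positional, easy to swap
    // after: named accessors document themselves at each call site
    format!("matched={}, pruned={}", pm.total_matched(), pm.total_pruned())
}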
.with_matched_by_stats(Some(4)) - .with_pruned_by_stats(Some(0)) - .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(4)) - .with_pruned_by_bloom_filter(Some(0)) - .with_expected_rows(3) - .test_row_group_prune() - .await; - } - - #[tokio::test] - async fn []() { - RowGroupPruningTest::new() - .with_scenario(Scenario::Int) - .with_query(&format!("SELECT * FROM t where i{}+1 = 1", $bits)) - .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(4)) - .with_pruned_by_stats(Some(0)) - .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(4)) - .with_pruned_by_bloom_filter(Some(0)) - .with_expected_rows(2) - .test_row_group_prune() - .await; - } - - #[tokio::test] - async fn []() { - RowGroupPruningTest::new() - .with_scenario(Scenario::Int) - .with_query(&format!("SELECT * FROM t where 1-i{} > 1", $bits)) - .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(4)) - .with_pruned_by_stats(Some(0)) - .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(4)) - .with_pruned_by_bloom_filter(Some(0)) - .with_expected_rows(9) - .test_row_group_prune() - .await; - } - - #[tokio::test] - async fn []() { - // result of sql "SELECT * FROM t where in (1)" - RowGroupPruningTest::new() - .with_scenario(Scenario::Int) - .with_query(&format!("SELECT * FROM t where i{} in (1)", $bits)) - .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(1)) - .with_pruned_by_stats(Some(3)) - .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(1)) - .with_pruned_by_bloom_filter(Some(0)) - .with_expected_rows(1) - .test_row_group_prune() - .await; - } - - #[tokio::test] - async fn []() { - // result of sql "SELECT * FROM t where in (1000)", prune all - // test whether statistics works - RowGroupPruningTest::new() - .with_scenario(Scenario::Int) - .with_query(&format!("SELECT * FROM t where i{} in (100)", $bits)) - .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(0)) - .with_pruned_by_stats(Some(0)) - .with_pruned_files(Some(1)) - .with_matched_by_bloom_filter(Some(0)) - .with_pruned_by_bloom_filter(Some(0)) - .with_expected_rows(0) - .test_row_group_prune() - .await; - } - - #[tokio::test] - async fn []() { - // result of sql "SELECT * FROM t where not in (1)" prune nothing - RowGroupPruningTest::new() - .with_scenario(Scenario::Int) - .with_query(&format!("SELECT * FROM t where i{} not in (1)", $bits)) - .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(4)) - .with_pruned_by_stats(Some(0)) - .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(4)) - .with_pruned_by_bloom_filter(Some(0)) - .with_expected_rows(19) - .test_row_group_prune() - .await; - } + ($bits:expr, $fn_lt:ident, $fn_eq:ident, $fn_scalar_fun_and_eq:ident, $fn_scalar_fun:ident, $fn_complex_expr:ident, $fn_complex_expr_subtract:ident, $fn_eq_in_list:ident, $fn_eq_in_list_2:ident, $fn_eq_in_list_negated:ident) => { + #[tokio::test] + async fn $fn_lt() { + RowGroupPruningTest::new() + .with_scenario(Scenario::Int) + .with_query(&format!("SELECT * FROM t where i{} < 1", $bits)) + .with_expected_errors(Some(0)) + .with_matched_by_stats(Some(3)) + .with_pruned_by_stats(Some(1)) + .with_pruned_files(Some(0)) + .with_matched_by_bloom_filter(Some(3)) + .with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(11) + .test_row_group_prune() + .await; + + // result of sql "SELECT * FROM t where i < 1" is same as + // "SELECT * FROM t where -i > -1" + RowGroupPruningTest::new() + .with_scenario(Scenario::Int) + .with_query(&format!("SELECT * FROM t where -i{} > -1", 
$bits)) + .with_expected_errors(Some(0)) + .with_matched_by_stats(Some(3)) + .with_pruned_by_stats(Some(1)) + .with_pruned_files(Some(0)) + .with_matched_by_bloom_filter(Some(3)) + .with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(11) + .test_row_group_prune() + .await; + } + + #[tokio::test] + async fn $fn_eq() { + RowGroupPruningTest::new() + .with_scenario(Scenario::Int) + .with_query(&format!("SELECT * FROM t where i{} = 1", $bits)) + .with_expected_errors(Some(0)) + .with_matched_by_stats(Some(1)) + .with_pruned_by_stats(Some(3)) + .with_pruned_files(Some(0)) + .with_matched_by_bloom_filter(Some(1)) + .with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(1) + .test_row_group_prune() + .await; + } + #[tokio::test] + async fn $fn_scalar_fun_and_eq() { + RowGroupPruningTest::new() + .with_scenario(Scenario::Int) + .with_query(&format!( + "SELECT * FROM t where abs(i{}) = 1 and i{} = 1", + $bits, $bits + )) + .with_expected_errors(Some(0)) + .with_matched_by_stats(Some(1)) + .with_pruned_by_stats(Some(3)) + .with_pruned_files(Some(0)) + .with_matched_by_bloom_filter(Some(1)) + .with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(1) + .test_row_group_prune() + .await; + } + + #[tokio::test] + async fn $fn_scalar_fun() { + RowGroupPruningTest::new() + .with_scenario(Scenario::Int) + .with_query(&format!("SELECT * FROM t where abs(i{}) = 1", $bits)) + .with_expected_errors(Some(0)) + .with_matched_by_stats(Some(4)) + .with_pruned_by_stats(Some(0)) + .with_pruned_files(Some(0)) + .with_matched_by_bloom_filter(Some(4)) + .with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(3) + .test_row_group_prune() + .await; + } + + #[tokio::test] + async fn $fn_complex_expr() { + RowGroupPruningTest::new() + .with_scenario(Scenario::Int) + .with_query(&format!("SELECT * FROM t where i{}+1 = 1", $bits)) + .with_expected_errors(Some(0)) + .with_matched_by_stats(Some(4)) + .with_pruned_by_stats(Some(0)) + .with_pruned_files(Some(0)) + .with_matched_by_bloom_filter(Some(4)) + .with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(2) + .test_row_group_prune() + .await; + } + + #[tokio::test] + async fn $fn_complex_expr_subtract() { + RowGroupPruningTest::new() + .with_scenario(Scenario::Int) + .with_query(&format!("SELECT * FROM t where 1-i{} > 1", $bits)) + .with_expected_errors(Some(0)) + .with_matched_by_stats(Some(4)) + .with_pruned_by_stats(Some(0)) + .with_pruned_files(Some(0)) + .with_matched_by_bloom_filter(Some(4)) + .with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(9) + .test_row_group_prune() + .await; + } + + #[tokio::test] + async fn $fn_eq_in_list() { + // result of sql "SELECT * FROM t where in (1)" + RowGroupPruningTest::new() + .with_scenario(Scenario::Int) + .with_query(&format!("SELECT * FROM t where i{} in (1)", $bits)) + .with_expected_errors(Some(0)) + .with_matched_by_stats(Some(1)) + .with_pruned_by_stats(Some(3)) + .with_pruned_files(Some(0)) + .with_matched_by_bloom_filter(Some(1)) + .with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(1) + .test_row_group_prune() + .await; + } + + #[tokio::test] + async fn $fn_eq_in_list_2() { + // result of sql "SELECT * FROM t where in (1000)", prune all + // test whether statistics works + RowGroupPruningTest::new() + .with_scenario(Scenario::Int) + .with_query(&format!("SELECT * FROM t where i{} in (100)", $bits)) + .with_expected_errors(Some(0)) + .with_matched_by_stats(Some(0)) + .with_pruned_by_stats(Some(0)) + .with_pruned_files(Some(1)) + .with_matched_by_bloom_filter(Some(0)) + 
.with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(0) + .test_row_group_prune() + .await; + } + + #[tokio::test] + async fn $fn_eq_in_list_negated() { + // result of sql "SELECT * FROM t where not in (1)" prune nothing + RowGroupPruningTest::new() + .with_scenario(Scenario::Int) + .with_query(&format!("SELECT * FROM t where i{} not in (1)", $bits)) + .with_expected_errors(Some(0)) + .with_matched_by_stats(Some(4)) + .with_pruned_by_stats(Some(0)) + .with_pruned_files(Some(0)) + .with_matched_by_bloom_filter(Some(4)) + .with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(19) + .test_row_group_prune() + .await; } }; } // int8/int16 are incorrect: https://github.com/apache/datafusion/issues/9779 -int_tests!(32); -int_tests!(64); +int_tests!( + 32, + prune_int32_lt, + prune_int32_eq, + prune_int32_scalar_fun_and_eq, + prune_int32_scalar_fun, + prune_int32_complex_expr, + prune_int32_complex_expr_subtract, + prune_int32_eq_in_list, + prune_int32_eq_in_list_2, + prune_int32_eq_in_list_negated +); +int_tests!( + 64, + prune_int64_lt, + prune_int64_eq, + prune_int64_scalar_fun_and_eq, + prune_int64_scalar_fun, + prune_int64_complex_expr, + prune_int64_complex_expr_subtract, + prune_int64_eq_in_list, + prune_int64_eq_in_list_2, + prune_int64_eq_in_list_negated +); // $bits: number of bits of the integer to test (8, 16, 32, 64) // $correct_bloom_filters: if false, replicates the // https://github.com/apache/datafusion/issues/9779 bug so that tests pass // if and only if Bloom filters on UInt8 and UInt16 columns are still buggy. macro_rules! uint_tests { - ($bits:expr) => { - paste::item! { - #[tokio::test] - async fn []() { - RowGroupPruningTest::new() - .with_scenario(Scenario::UInt) - .with_query(&format!("SELECT * FROM t where u{} < 6", $bits)) - .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(3)) - .with_pruned_by_stats(Some(1)) - .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(3)) - .with_pruned_by_bloom_filter(Some(0)) - .with_expected_rows(11) - .test_row_group_prune() - .await; - } - - #[tokio::test] - async fn []() { - RowGroupPruningTest::new() - .with_scenario(Scenario::UInt) - .with_query(&format!("SELECT * FROM t where u{} = 6", $bits)) - .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(1)) - .with_pruned_by_stats(Some(3)) - .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(1)) - .with_pruned_by_bloom_filter(Some(0)) - .with_expected_rows(1) - .test_row_group_prune() - .await; - } - #[tokio::test] - async fn []() { - RowGroupPruningTest::new() - .with_scenario(Scenario::UInt) - .with_query(&format!("SELECT * FROM t where power(u{}, 2) = 36 and u{} = 6", $bits, $bits)) - .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(1)) - .with_pruned_by_stats(Some(3)) - .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(1)) - .with_pruned_by_bloom_filter(Some(0)) - .with_expected_rows(1) - .test_row_group_prune() - .await; - } - - #[tokio::test] - async fn []() { - RowGroupPruningTest::new() - .with_scenario(Scenario::UInt) - .with_query(&format!("SELECT * FROM t where power(u{}, 2) = 25", $bits)) - .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(4)) - .with_pruned_by_stats(Some(0)) - .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(4)) - .with_pruned_by_bloom_filter(Some(0)) - .with_expected_rows(2) - .test_row_group_prune() - .await; - } - - #[tokio::test] - async fn []() { - RowGroupPruningTest::new() - .with_scenario(Scenario::UInt) - .with_query(&format!("SELECT * FROM 
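A worked note on the in (100) expectations above: the Int scenario's values span roughly -5 to 9 (see the page comments earlier in this diff), so 100 falls outside the file-level range and the file is pruned before row-group pruning ever runs; hence pruned_files = Some(1) with zero row groups matched or pruned. The range check in miniature (range values assumed from the scenario comments):

// Miniature of the range check implied by the expectations above.
fn in_list_may_match(min: i32, max: i32, list: &[i32]) -> bool {
    list.iter().any(|v| (min..=max).contains(v))
}

fn check() {
    // Int scenario spans about [-5, 9]; 100 cannot match, so the file
    // is pruned outright (pruned_files = 1, no row-group work needed).
    assert!(!in_list_may_match(-5, 9, &[100]));
    // 1 falls inside the range: row groups must be consulted.
    assert!(in_list_may_match(-5, 9, &[1]));
}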
t where u{}+1 = 6", $bits)) - .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(4)) - .with_pruned_by_stats(Some(0)) - .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(4)) - .with_pruned_by_bloom_filter(Some(0)) - .with_expected_rows(2) - .test_row_group_prune() - .await; - } - - #[tokio::test] - async fn []() { - // result of sql "SELECT * FROM t where in (1)" - RowGroupPruningTest::new() - .with_scenario(Scenario::UInt) - .with_query(&format!("SELECT * FROM t where u{} in (6)", $bits)) - .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(1)) - .with_pruned_by_stats(Some(3)) - .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(1)) - .with_pruned_by_bloom_filter(Some(0)) - .with_expected_rows(1) - .test_row_group_prune() - .await; - } - - #[tokio::test] - async fn []() { - // result of sql "SELECT * FROM t where in (1000)", prune all - // test whether statistics works - RowGroupPruningTest::new() - .with_scenario(Scenario::UInt) - .with_query(&format!("SELECT * FROM t where u{} in (100)", $bits)) - .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(0)) - .with_pruned_by_stats(Some(4)) - .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) - .with_pruned_by_bloom_filter(Some(0)) - .with_expected_rows(0) - .test_row_group_prune() - .await; - } - - #[tokio::test] - async fn []() { - // result of sql "SELECT * FROM t where not in (1)" prune nothing - RowGroupPruningTest::new() - .with_scenario(Scenario::UInt) - .with_query(&format!("SELECT * FROM t where u{} not in (6)", $bits)) - .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(4)) - .with_pruned_by_stats(Some(0)) - .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(4)) - .with_pruned_by_bloom_filter(Some(0)) - .with_expected_rows(19) - .test_row_group_prune() - .await; - } + ($bits:expr, $fn_lt:ident, $fn_eq:ident, $fn_scalar_fun_and_eq:ident, $fn_scalar_fun:ident, $fn_complex_expr:ident, $fn_eq_in_list:ident, $fn_eq_in_list_2:ident, $fn_eq_in_list_negated:ident) => { + #[tokio::test] + async fn $fn_lt() { + RowGroupPruningTest::new() + .with_scenario(Scenario::UInt) + .with_query(&format!("SELECT * FROM t where u{} < 6", $bits)) + .with_expected_errors(Some(0)) + .with_matched_by_stats(Some(3)) + .with_pruned_by_stats(Some(1)) + .with_pruned_files(Some(0)) + .with_matched_by_bloom_filter(Some(3)) + .with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(11) + .test_row_group_prune() + .await; + } + + #[tokio::test] + async fn $fn_eq() { + RowGroupPruningTest::new() + .with_scenario(Scenario::UInt) + .with_query(&format!("SELECT * FROM t where u{} = 6", $bits)) + .with_expected_errors(Some(0)) + .with_matched_by_stats(Some(1)) + .with_pruned_by_stats(Some(3)) + .with_pruned_files(Some(0)) + .with_matched_by_bloom_filter(Some(1)) + .with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(1) + .test_row_group_prune() + .await; + } + #[tokio::test] + async fn $fn_scalar_fun_and_eq() { + RowGroupPruningTest::new() + .with_scenario(Scenario::UInt) + .with_query(&format!( + "SELECT * FROM t where power(u{}, 2) = 36 and u{} = 6", + $bits, $bits + )) + .with_expected_errors(Some(0)) + .with_matched_by_stats(Some(1)) + .with_pruned_by_stats(Some(3)) + .with_pruned_files(Some(0)) + .with_matched_by_bloom_filter(Some(1)) + .with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(1) + .test_row_group_prune() + .await; + } + + #[tokio::test] + async fn $fn_scalar_fun() { + RowGroupPruningTest::new() + .with_scenario(Scenario::UInt) + 
.with_query(&format!("SELECT * FROM t where power(u{}, 2) = 25", $bits)) + .with_expected_errors(Some(0)) + .with_matched_by_stats(Some(4)) + .with_pruned_by_stats(Some(0)) + .with_pruned_files(Some(0)) + .with_matched_by_bloom_filter(Some(4)) + .with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(2) + .test_row_group_prune() + .await; + } + + #[tokio::test] + async fn $fn_complex_expr() { + RowGroupPruningTest::new() + .with_scenario(Scenario::UInt) + .with_query(&format!("SELECT * FROM t where u{}+1 = 6", $bits)) + .with_expected_errors(Some(0)) + .with_matched_by_stats(Some(4)) + .with_pruned_by_stats(Some(0)) + .with_pruned_files(Some(0)) + .with_matched_by_bloom_filter(Some(4)) + .with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(2) + .test_row_group_prune() + .await; + } + + #[tokio::test] + async fn $fn_eq_in_list() { + // result of sql "SELECT * FROM t where in (1)" + RowGroupPruningTest::new() + .with_scenario(Scenario::UInt) + .with_query(&format!("SELECT * FROM t where u{} in (6)", $bits)) + .with_expected_errors(Some(0)) + .with_matched_by_stats(Some(1)) + .with_pruned_by_stats(Some(3)) + .with_pruned_files(Some(0)) + .with_matched_by_bloom_filter(Some(1)) + .with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(1) + .test_row_group_prune() + .await; + } + + #[tokio::test] + async fn $fn_eq_in_list_2() { + // result of sql "SELECT * FROM t where in (1000)", prune all + // test whether statistics works + RowGroupPruningTest::new() + .with_scenario(Scenario::UInt) + .with_query(&format!("SELECT * FROM t where u{} in (100)", $bits)) + .with_expected_errors(Some(0)) + .with_matched_by_stats(Some(0)) + .with_pruned_by_stats(Some(4)) + .with_pruned_files(Some(0)) + .with_matched_by_bloom_filter(Some(0)) + .with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(0) + .test_row_group_prune() + .await; + } + + #[tokio::test] + async fn $fn_eq_in_list_negated() { + // result of sql "SELECT * FROM t where not in (1)" prune nothing + RowGroupPruningTest::new() + .with_scenario(Scenario::UInt) + .with_query(&format!("SELECT * FROM t where u{} not in (6)", $bits)) + .with_expected_errors(Some(0)) + .with_matched_by_stats(Some(4)) + .with_pruned_by_stats(Some(0)) + .with_pruned_files(Some(0)) + .with_matched_by_bloom_filter(Some(4)) + .with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(19) + .test_row_group_prune() + .await; } }; } // uint8/uint16 are incorrect: https://github.com/apache/datafusion/issues/9779 -uint_tests!(32); -uint_tests!(64); +uint_tests!( + 32, + prune_uint32_lt, + prune_uint32_eq, + prune_uint32_scalar_fun_and_eq, + prune_uint32_scalar_fun, + prune_uint32_complex_expr, + prune_uint32_eq_in_list, + prune_uint32_eq_in_list_2, + prune_uint32_eq_in_list_negated +); +uint_tests!( + 64, + prune_uint64_lt, + prune_uint64_eq, + prune_uint64_scalar_fun_and_eq, + prune_uint64_scalar_fun, + prune_uint64_complex_expr, + prune_uint64_eq_in_list, + prune_uint64_eq_in_list_2, + prune_uint64_eq_in_list_negated +); #[tokio::test] async fn prune_int32_eq_large_in_list() { @@ -1636,3 +1766,240 @@ async fn test_bloom_filter_decimal_dict() { .test_row_group_prune() .await; } + +// Helper function to create a batch with a single Int32 column. 
+fn make_i32_batch( + name: &str, + values: Vec, +) -> datafusion_common::error::Result { + let schema = Arc::new(Schema::new(vec![Field::new(name, DataType::Int32, false)])); + let array: ArrayRef = Arc::new(Int32Array::from(values)); + RecordBatch::try_new(schema, vec![array]).map_err(DataFusionError::from) +} + +// Helper function to create a batch with two Int32 columns +fn make_two_col_i32_batch( + name_a: &str, + name_b: &str, + values_a: Vec, + values_b: Vec, +) -> datafusion_common::error::Result { + let schema = Arc::new(Schema::new(vec![ + Field::new(name_a, DataType::Int32, false), + Field::new(name_b, DataType::Int32, false), + ])); + let array_a: ArrayRef = Arc::new(Int32Array::from(values_a)); + let array_b: ArrayRef = Arc::new(Int32Array::from(values_b)); + RecordBatch::try_new(schema, vec![array_a, array_b]).map_err(DataFusionError::from) +} + +#[tokio::test] +async fn test_limit_pruning_basic() -> datafusion_common::error::Result<()> { + // Scenario: Simple integer column, multiple row groups + // Query: SELECT c1 FROM t WHERE c1 = 0 LIMIT 2 + // We expect 2 rows in total. + + // Row Group 0: c1 = [0, -2] -> Partially matched, 1 row + // Row Group 1: c1 = [1, 2] -> Fully matched, 2 rows + // Row Group 2: c1 = [3, 4] -> Fully matched, 2 rows + // Row Group 3: c1 = [5, 6] -> Fully matched, 2 rows + // Row Group 4: c1 = [-1, -2] -> Not matched + + // If limit = 2, and RG1 is fully matched and has 2 rows, we should + // only scan RG1 and prune other row groups + // RG4 is pruned by statistics. RG2 and RG3 are pruned by limit. + // So 2 row groups are effectively pruned due to limit pruning. + + let schema = Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)])); + let query = "SELECT c1 FROM t WHERE c1 >= 0 LIMIT 2"; + + let batches = vec![ + make_i32_batch("c1", vec![0, -2])?, + make_i32_batch("c1", vec![0, 0])?, + make_i32_batch("c1", vec![0, 0])?, + make_i32_batch("c1", vec![0, 0])?, + make_i32_batch("c1", vec![-1, -2])?, + ]; + + RowGroupPruningTest::new() + .with_scenario(Scenario::Int) // Assuming Scenario::Int can handle this data + .with_query(query) + .with_expected_errors(Some(0)) + .with_expected_rows(2) + .with_pruned_files(Some(0)) + .with_matched_by_stats(Some(4)) + .with_fully_matched_by_stats(Some(3)) + .with_pruned_by_stats(Some(1)) + .with_limit_pruned_row_groups(Some(3)) + .test_row_group_prune_with_custom_data(schema, batches, 2) + .await; + + Ok(()) +} + +#[tokio::test] +async fn test_limit_pruning_complex_filter() -> datafusion_common::error::Result<()> { + // Test Case 1: Complex filter with two columns (a = 1 AND b > 1 AND b < 4) + // Row Group 0: a=[1,1,1], b=[0,2,3] -> Partially matched, 2 rows match (b=2,3) + // Row Group 1: a=[1,1,1], b=[2,2,2] -> Fully matched, 3 rows + // Row Group 2: a=[1,1,1], b=[2,3,3] -> Fully matched, 3 rows + // Row Group 3: a=[1,1,1], b=[2,2,3] -> Fully matched, 3 rows + // Row Group 4: a=[2,2,2], b=[2,2,2] -> Not matched (a != 1) + // Row Group 5: a=[1,1,1], b=[5,6,7] -> Not matched (b >= 4) + + // With LIMIT 5, we need RG1 (3 rows) + RG2 (2 rows from 3) = 5 rows + // RG4 and RG5 should be pruned by statistics + // RG3 should be pruned by limit + // RG0 is partially matched, so it depends on the order + + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + ])); + let query = "SELECT a, b FROM t WHERE a = 1 AND b > 1 AND b < 4 LIMIT 5"; + + let batches = vec![ + make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![0, 2, 3])?, + 
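The limit-pruning arithmetic in test_limit_pruning_basic is worth spelling out, since the inline comment undercounts: RG1 alone covers LIMIT 2, so the other fully matched groups (RG2, RG3) and the partially matched RG0 are all skippable, which is the Some(3) asserted for limit_pruned_row_groups. A small model that reproduces the expectations of all the limit tests in this file (a sketch of the bookkeeping, not the engine's implementation):

// Sketch: guaranteed rows come only from fully matched row groups; once
// those cover the limit, every remaining candidate group (fully or
// partially matched) can be skipped.
fn limit_pruned_groups(
    fully_matched_rows: &[usize], // row counts of fully matched groups
    partial_candidates: usize,    // partially matched groups
    limit: usize,
) -> usize {
    let mut need = limit;
    let mut kept = 0;
    for rows in fully_matched_rows {
        if need == 0 {
            break;
        }
        kept += 1;
        need = need.saturating_sub(*rows);
    }
    if need == 0 {
        (fully_matched_rows.len() - kept) + partial_candidates
    } else {
        0 // limit not provable from stats: scan all candidates
    }
}

fn check_against_tests() {
    assert_eq!(limit_pruned_groups(&[2, 2, 2], 1, 2), 3); // basic
    assert_eq!(limit_pruned_groups(&[3, 3, 3], 1, 5), 2); // complex filter
    assert_eq!(limit_pruned_groups(&[4, 4, 4, 4], 0, 8), 2); // multiple fully matched
    assert_eq!(limit_pruned_groups(&[], 4, 3), 0); // no fully matched
    assert_eq!(limit_pruned_groups(&[4, 4], 2, 10), 0); // exceeds fully matched
}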
make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![2, 2, 2])?, + make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![2, 3, 3])?, + make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![2, 2, 3])?, + make_two_col_i32_batch("a", "b", vec![2, 2, 2], vec![2, 2, 2])?, + make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![5, 6, 7])?, + ]; + + RowGroupPruningTest::new() + .with_scenario(Scenario::Int) + .with_query(query) + .with_expected_errors(Some(0)) + .with_expected_rows(5) + .with_pruned_files(Some(0)) + .with_matched_by_stats(Some(4)) // RG0,1,2,3 are matched + .with_fully_matched_by_stats(Some(3)) + .with_pruned_by_stats(Some(2)) // RG4,5 are pruned + .with_limit_pruned_row_groups(Some(2)) // RG0, RG3 is pruned by limit + .test_row_group_prune_with_custom_data(schema, batches, 3) + .await; + + Ok(()) +} + +#[tokio::test] +async fn test_limit_pruning_multiple_fully_matched() +-> datafusion_common::error::Result<()> { + // Test Case 2: Limit requires multiple fully matched row groups + // Row Group 0: a=[5,5,5,5] -> Fully matched, 4 rows + // Row Group 1: a=[5,5,5,5] -> Fully matched, 4 rows + // Row Group 2: a=[5,5,5,5] -> Fully matched, 4 rows + // Row Group 3: a=[5,5,5,5] -> Fully matched, 4 rows + // Row Group 4: a=[1,2,3,4] -> Not matched + + // With LIMIT 8, we need RG0 (4 rows) + RG1 (4 rows) 8 rows + // RG2,3 should be pruned by limit + // RG4 should be pruned by statistics + + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); + let query = "SELECT a FROM t WHERE a = 5 LIMIT 8"; + + let batches = vec![ + make_i32_batch("a", vec![5, 5, 5, 5])?, + make_i32_batch("a", vec![5, 5, 5, 5])?, + make_i32_batch("a", vec![5, 5, 5, 5])?, + make_i32_batch("a", vec![5, 5, 5, 5])?, + make_i32_batch("a", vec![1, 2, 3, 4])?, + ]; + + RowGroupPruningTest::new() + .with_scenario(Scenario::Int) + .with_query(query) + .with_expected_errors(Some(0)) + .with_expected_rows(8) + .with_pruned_files(Some(0)) + .with_matched_by_stats(Some(4)) // RG0,1,2,3 matched + .with_fully_matched_by_stats(Some(4)) + .with_pruned_by_stats(Some(1)) // RG4 pruned + .with_limit_pruned_row_groups(Some(2)) // RG2,3 pruned by limit + .test_row_group_prune_with_custom_data(schema, batches, 4) + .await; + + Ok(()) +} + +#[tokio::test] +async fn test_limit_pruning_no_fully_matched() -> datafusion_common::error::Result<()> { + // Test Case 3: No fully matched row groups - all are partially matched + // Row Group 0: a=[1,2,3] -> Partially matched, 1 row (a=2) + // Row Group 1: a=[2,3,4] -> Partially matched, 1 row (a=2) + // Row Group 2: a=[2,5,6] -> Partially matched, 1 row (a=2) + // Row Group 3: a=[2,7,8] -> Partially matched, 1 row (a=2) + // Row Group 4: a=[9,10,11] -> Not matched + + // With LIMIT 3, we need to scan RG0,1,2 to get 3 matching rows + // Cannot prune much by limit since all matching RGs are partial + // RG4 should be pruned by statistics + + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); + let query = "SELECT a FROM t WHERE a = 2 LIMIT 3"; + + let batches = vec![ + make_i32_batch("a", vec![1, 2, 3])?, + make_i32_batch("a", vec![2, 3, 4])?, + make_i32_batch("a", vec![2, 5, 6])?, + make_i32_batch("a", vec![2, 7, 8])?, + make_i32_batch("a", vec![9, 10, 11])?, + ]; + + RowGroupPruningTest::new() + .with_scenario(Scenario::Int) + .with_query(query) + .with_expected_errors(Some(0)) + .with_expected_rows(3) + .with_pruned_files(Some(0)) + .with_matched_by_stats(Some(4)) // RG0,1,2,3 matched + .with_fully_matched_by_stats(Some(0)) + 
.with_pruned_by_stats(Some(1)) // RG4 pruned + .with_limit_pruned_row_groups(Some(0)) // RG3 pruned by limit + .test_row_group_prune_with_custom_data(schema, batches, 3) + .await; + + Ok(()) +} + +#[tokio::test] +async fn test_limit_pruning_exceeds_fully_matched() -> datafusion_common::error::Result<()> +{ + // Test Case 4: Limit exceeds all fully matched rows, need partially matched + // Row Group 0: a=[10,11,12,12] -> Partially matched, 1 row (a=10) + // Row Group 1: a=[10,10,10,10] -> Fully matched, 4 rows + // Row Group 2: a=[10,10,10,10] -> Fully matched, 4 rows + // Row Group 3: a=[10,13,14,11] -> Partially matched, 1 row (a=10) + // Row Group 4: a=[20,21,22,22] -> Not matched + + // With LIMIT 10, we need RG1 (4) + RG2 (4) = 8 from fully matched + // Still need 2 more, so we need to scan partially matched RG0 and RG3 + // All matching row groups should be scanned, only RG4 pruned by statistics + + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); + let query = "SELECT a FROM t WHERE a = 10 LIMIT 10"; + + let batches = vec![ + make_i32_batch("a", vec![10, 11, 12, 12])?, + make_i32_batch("a", vec![10, 10, 10, 10])?, + make_i32_batch("a", vec![10, 10, 10, 10])?, + make_i32_batch("a", vec![10, 13, 14, 11])?, + make_i32_batch("a", vec![20, 21, 22, 22])?, + ]; + + RowGroupPruningTest::new() + .with_scenario(Scenario::Int) + .with_query(query) + .with_expected_errors(Some(0)) + .with_expected_rows(10) // Total: 1 + 4 + 4 + 1 = 10 + .with_pruned_files(Some(0)) + .with_matched_by_stats(Some(4)) // RG0,1,2,3 matched + .with_fully_matched_by_stats(Some(2)) + .with_pruned_by_stats(Some(1)) // RG4 pruned + .with_limit_pruned_row_groups(Some(0)) // No limit pruning since we need all RGs + .test_row_group_prune_with_custom_data(schema, batches, 4) + .await; + Ok(()) +} diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs deleted file mode 100644 index 40fc6176e212b..0000000000000 --- a/datafusion/core/tests/parquet/schema_adapter.rs +++ /dev/null @@ -1,553 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
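The file removed below was the integration test for two extension points; since the deletion drops their inline documentation from this area, here is the surface it exercised, condensed from the deleted code itself (signatures abbreviated, bodies elided):

// Condensed from the deleted tests that follow. Two extension points:
//
// Schema adaptation (batch-level):
//   SchemaAdapterFactory::create(projected_table_schema, table_schema)
//       -> Box<dyn SchemaAdapter>
//   SchemaAdapter::map_schema(file_schema)
//       -> (Arc<dyn SchemaMapper>, Vec<usize>)    // mapper + projection
//   SchemaMapper::map_batch(batch) -> RecordBatch // cast / fill defaults
//
// Expression adaptation (predicate-level, used for pushdown):
//   PhysicalExprAdapterFactory::create(logical_schema, physical_schema)
//       -> Arc<dyn PhysicalExprAdapter>
//   PhysicalExprAdapter::rewrite(expr) -> Result<Arc<dyn PhysicalExpr>>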
- -use std::sync::Arc; - -use arrow::array::{record_batch, RecordBatch, RecordBatchOptions}; -use arrow::compute::{cast_with_options, CastOptions}; -use arrow_schema::{DataType, Field, FieldRef, Schema, SchemaRef}; -use bytes::{BufMut, BytesMut}; -use datafusion::assert_batches_eq; -use datafusion::common::Result; -use datafusion::datasource::listing::{ - ListingTable, ListingTableConfig, ListingTableConfigExt, -}; -use datafusion::prelude::{SessionConfig, SessionContext}; -use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; -use datafusion_common::DataFusionError; -use datafusion_common::{ColumnStatistics, ScalarValue}; -use datafusion_datasource::file::FileSource; -use datafusion_datasource::file_scan_config::FileScanConfigBuilder; -use datafusion_datasource::schema_adapter::{ - DefaultSchemaAdapterFactory, SchemaAdapter, SchemaAdapterFactory, SchemaMapper, -}; -use datafusion_datasource::ListingTableUrl; -use datafusion_datasource_parquet::source::ParquetSource; -use datafusion_execution::object_store::ObjectStoreUrl; -use datafusion_physical_expr::expressions::{self, Column}; -use datafusion_physical_expr::PhysicalExpr; -use datafusion_physical_expr_adapter::{ - DefaultPhysicalExprAdapter, DefaultPhysicalExprAdapterFactory, PhysicalExprAdapter, - PhysicalExprAdapterFactory, -}; -use itertools::Itertools; -use object_store::{memory::InMemory, path::Path, ObjectStore}; -use parquet::arrow::ArrowWriter; - -async fn write_parquet(batch: RecordBatch, store: Arc, path: &str) { - let mut out = BytesMut::new().writer(); - { - let mut writer = ArrowWriter::try_new(&mut out, batch.schema(), None).unwrap(); - writer.write(&batch).unwrap(); - writer.finish().unwrap(); - } - let data = out.into_inner().freeze(); - store.put(&Path::from(path), data.into()).await.unwrap(); -} - -#[derive(Debug)] -struct CustomSchemaAdapterFactory; - -impl SchemaAdapterFactory for CustomSchemaAdapterFactory { - fn create( - &self, - projected_table_schema: SchemaRef, - _table_schema: SchemaRef, - ) -> Box { - Box::new(CustomSchemaAdapter { - logical_file_schema: projected_table_schema, - }) - } -} - -#[derive(Debug)] -struct CustomSchemaAdapter { - logical_file_schema: SchemaRef, -} - -impl SchemaAdapter for CustomSchemaAdapter { - fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option { - for (idx, field) in file_schema.fields().iter().enumerate() { - if field.name() == self.logical_file_schema.field(index).name() { - return Some(idx); - } - } - None - } - - fn map_schema( - &self, - file_schema: &Schema, - ) -> Result<(Arc, Vec)> { - let projection = (0..file_schema.fields().len()).collect_vec(); - Ok(( - Arc::new(CustomSchemaMapper { - logical_file_schema: Arc::clone(&self.logical_file_schema), - }), - projection, - )) - } -} - -#[derive(Debug)] -struct CustomSchemaMapper { - logical_file_schema: SchemaRef, -} - -impl SchemaMapper for CustomSchemaMapper { - fn map_batch(&self, batch: RecordBatch) -> Result { - let mut output_columns = - Vec::with_capacity(self.logical_file_schema.fields().len()); - for field in self.logical_file_schema.fields() { - if let Some(array) = batch.column_by_name(field.name()) { - output_columns.push(cast_with_options( - array, - field.data_type(), - &CastOptions::default(), - )?); - } else { - // Create a new array with the default value for the field type - let default_value = match field.data_type() { - DataType::Int64 => ScalarValue::Int64(Some(0)), - DataType::Utf8 => ScalarValue::Utf8(Some("a".to_string())), - _ => 
unimplemented!("Unsupported data type: {}", field.data_type()), - }; - output_columns - .push(default_value.to_array_of_size(batch.num_rows()).unwrap()); - } - } - let batch = RecordBatch::try_new_with_options( - Arc::clone(&self.logical_file_schema), - output_columns, - &RecordBatchOptions::new().with_row_count(Some(batch.num_rows())), - ) - .unwrap(); - Ok(batch) - } - - fn map_column_statistics( - &self, - _file_col_statistics: &[ColumnStatistics], - ) -> Result> { - Ok(vec![ - ColumnStatistics::new_unknown(); - self.logical_file_schema.fields().len() - ]) - } -} - -// Implement a custom PhysicalExprAdapterFactory that fills in missing columns with the default value for the field type -#[derive(Debug)] -struct CustomPhysicalExprAdapterFactory; - -impl PhysicalExprAdapterFactory for CustomPhysicalExprAdapterFactory { - fn create( - &self, - logical_file_schema: SchemaRef, - physical_file_schema: SchemaRef, - ) -> Arc { - Arc::new(CustomPhysicalExprAdapter { - logical_file_schema: Arc::clone(&logical_file_schema), - physical_file_schema: Arc::clone(&physical_file_schema), - inner: Arc::new(DefaultPhysicalExprAdapter::new( - logical_file_schema, - physical_file_schema, - )), - }) - } -} - -#[derive(Debug, Clone)] -struct CustomPhysicalExprAdapter { - logical_file_schema: SchemaRef, - physical_file_schema: SchemaRef, - inner: Arc, -} - -impl PhysicalExprAdapter for CustomPhysicalExprAdapter { - fn rewrite(&self, mut expr: Arc) -> Result> { - expr = expr - .transform(|expr| { - if let Some(column) = expr.as_any().downcast_ref::() { - let field_name = column.name(); - if self - .physical_file_schema - .field_with_name(field_name) - .ok() - .is_none() - { - let field = self - .logical_file_schema - .field_with_name(field_name) - .map_err(|_| { - DataFusionError::Plan(format!( - "Field '{field_name}' not found in logical file schema", - )) - })?; - // If the field does not exist, create a default value expression - // Note that we use slightly different logic here to create a default value so that we can see different behavior in tests - let default_value = match field.data_type() { - DataType::Int64 => ScalarValue::Int64(Some(1)), - DataType::Utf8 => ScalarValue::Utf8(Some("b".to_string())), - _ => unimplemented!( - "Unsupported data type: {}", - field.data_type() - ), - }; - return Ok(Transformed::yes(Arc::new( - expressions::Literal::new(default_value), - ))); - } - } - - Ok(Transformed::no(expr)) - }) - .data()?; - self.inner.rewrite(expr) - } - - fn with_partition_values( - &self, - partition_values: Vec<(FieldRef, ScalarValue)>, - ) -> Arc { - assert!( - partition_values.is_empty(), - "Partition values are not supported in this test" - ); - Arc::new(self.clone()) - } -} - -#[tokio::test] -async fn test_custom_schema_adapter_and_custom_expression_adapter() { - let batch = - record_batch!(("extra", Int64, [1, 2, 3]), ("c1", Int32, [1, 2, 3])).unwrap(); - - let store = Arc::new(InMemory::new()) as Arc; - let store_url = ObjectStoreUrl::parse("memory://").unwrap(); - let path = "test.parquet"; - write_parquet(batch, store.clone(), path).await; - - let table_schema = Arc::new(Schema::new(vec![ - Field::new("c1", DataType::Int64, false), - Field::new("c2", DataType::Utf8, true), - ])); - - let mut cfg = SessionConfig::new() - // Disable statistics collection for this test otherwise early pruning makes it hard to demonstrate data adaptation - .with_collect_statistics(false) - .with_parquet_pruning(false) - .with_parquet_page_index_pruning(false); - 
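A reading aid for the deleted test below (and the reason its expected outputs differ per configuration): the two custom adapters intentionally fill missing columns with different defaults, so the result row reveals which adapter handled the missing c2 column. In summary (values taken from the deleted code above):

// Missing-column defaults in the deleted test, per adapter:
//   CustomSchemaAdapter       -> Int64(0), Utf8("a")  (projection path)
//   CustomPhysicalExprAdapter -> Int64(1), Utf8("b")  (predicate pushdown path)
//
// Hence: custom schema adapter alone matches c2 = 'a'; custom expr
// adapter alone matches the pushed-down c2 = 'b' but projects NULL;
// both together match c2 = 'b' yet project 'a'.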
cfg.options_mut().execution.parquet.pushdown_filters = true; - let ctx = SessionContext::new_with_config(cfg); - ctx.register_object_store(store_url.as_ref(), Arc::clone(&store)); - assert!( - !ctx.state() - .config_mut() - .options_mut() - .execution - .collect_statistics - ); - assert!(!ctx.state().config().collect_statistics()); - - let listing_table_config = - ListingTableConfig::new(ListingTableUrl::parse("memory:///").unwrap()) - .infer_options(&ctx.state()) - .await - .unwrap() - .with_schema(table_schema.clone()) - .with_schema_adapter_factory(Arc::new(DefaultSchemaAdapterFactory)) - .with_expr_adapter_factory(Arc::new(DefaultPhysicalExprAdapterFactory)); - - let table = ListingTable::try_new(listing_table_config).unwrap(); - ctx.register_table("t", Arc::new(table)).unwrap(); - - let batches = ctx - .sql("SELECT c2, c1 FROM t WHERE c1 = 2 AND c2 IS NULL") - .await - .unwrap() - .collect() - .await - .unwrap(); - - let expected = [ - "+----+----+", - "| c2 | c1 |", - "+----+----+", - "| | 2 |", - "+----+----+", - ]; - assert_batches_eq!(expected, &batches); - - // Test using a custom schema adapter and no explicit physical expr adapter - // This should use the custom schema adapter both for projections and predicate pushdown - let listing_table_config = - ListingTableConfig::new(ListingTableUrl::parse("memory:///").unwrap()) - .infer_options(&ctx.state()) - .await - .unwrap() - .with_schema(table_schema.clone()) - .with_schema_adapter_factory(Arc::new(CustomSchemaAdapterFactory)); - let table = ListingTable::try_new(listing_table_config).unwrap(); - ctx.deregister_table("t").unwrap(); - ctx.register_table("t", Arc::new(table)).unwrap(); - let batches = ctx - .sql("SELECT c2, c1 FROM t WHERE c1 = 2 AND c2 = 'a'") - .await - .unwrap() - .collect() - .await - .unwrap(); - let expected = [ - "+----+----+", - "| c2 | c1 |", - "+----+----+", - "| a | 2 |", - "+----+----+", - ]; - assert_batches_eq!(expected, &batches); - - // Do the same test but with a custom physical expr adapter - // Now the default schema adapter will be used for projections, but the custom physical expr adapter will be used for predicate pushdown - let listing_table_config = - ListingTableConfig::new(ListingTableUrl::parse("memory:///").unwrap()) - .infer_options(&ctx.state()) - .await - .unwrap() - .with_schema(table_schema.clone()) - .with_expr_adapter_factory(Arc::new(CustomPhysicalExprAdapterFactory)); - let table = ListingTable::try_new(listing_table_config).unwrap(); - ctx.deregister_table("t").unwrap(); - ctx.register_table("t", Arc::new(table)).unwrap(); - let batches = ctx - .sql("SELECT c2, c1 FROM t WHERE c1 = 2 AND c2 = 'b'") - .await - .unwrap() - .collect() - .await - .unwrap(); - let expected = [ - "+----+----+", - "| c2 | c1 |", - "+----+----+", - "| | 2 |", - "+----+----+", - ]; - assert_batches_eq!(expected, &batches); - - // If we use both then the custom physical expr adapter will be used for predicate pushdown and the custom schema adapter will be used for projections - let listing_table_config = - ListingTableConfig::new(ListingTableUrl::parse("memory:///").unwrap()) - .infer_options(&ctx.state()) - .await - .unwrap() - .with_schema(table_schema.clone()) - .with_schema_adapter_factory(Arc::new(CustomSchemaAdapterFactory)) - .with_expr_adapter_factory(Arc::new(CustomPhysicalExprAdapterFactory)); - let table = ListingTable::try_new(listing_table_config).unwrap(); - ctx.deregister_table("t").unwrap(); - ctx.register_table("t", Arc::new(table)).unwrap(); - let batches = ctx - .sql("SELECT c2, c1 
FROM t WHERE c1 = 2 AND c2 = 'b'")
-        .await
-        .unwrap()
-        .collect()
-        .await
-        .unwrap();
-    let expected = [
-        "+----+----+",
-        "| c2 | c1 |",
-        "+----+----+",
-        "| a  | 2  |",
-        "+----+----+",
-    ];
-    assert_batches_eq!(expected, &batches);
-}
-
-/// A test schema adapter factory that adds prefix to column names
-#[derive(Debug)]
-struct PrefixAdapterFactory {
-    prefix: String,
-}
-
-impl SchemaAdapterFactory for PrefixAdapterFactory {
-    fn create(
-        &self,
-        projected_table_schema: SchemaRef,
-        _table_schema: SchemaRef,
-    ) -> Box<dyn SchemaAdapter> {
-        Box::new(PrefixAdapter {
-            input_schema: projected_table_schema,
-            prefix: self.prefix.clone(),
-        })
-    }
-}
-
-/// A test schema adapter that adds prefix to column names
-#[derive(Debug)]
-struct PrefixAdapter {
-    input_schema: SchemaRef,
-    prefix: String,
-}
-
-impl SchemaAdapter for PrefixAdapter {
-    fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option<usize> {
-        let field = self.input_schema.field(index);
-        file_schema.fields.find(field.name()).map(|(i, _)| i)
-    }
-
-    fn map_schema(
-        &self,
-        file_schema: &Schema,
-    ) -> Result<(Arc<dyn SchemaMapper>, Vec<usize>)> {
-        let mut projection = Vec::with_capacity(file_schema.fields().len());
-        for (file_idx, file_field) in file_schema.fields().iter().enumerate() {
-            if self.input_schema.fields().find(file_field.name()).is_some() {
-                projection.push(file_idx);
-            }
-        }
-
-        // Create a schema mapper that adds a prefix to column names
-        #[derive(Debug)]
-        struct PrefixSchemaMapping {
-            // Keep only the prefix field which is actually used in the implementation
-            prefix: String,
-        }
-
-        impl SchemaMapper for PrefixSchemaMapping {
-            fn map_batch(&self, batch: RecordBatch) -> Result<RecordBatch> {
-                // Create a new schema with prefixed field names
-                let prefixed_fields: Vec<Field> = batch
-                    .schema()
-                    .fields()
-                    .iter()
-                    .map(|field| {
-                        Field::new(
-                            format!("{}{}", self.prefix, field.name()),
-                            field.data_type().clone(),
-                            field.is_nullable(),
-                        )
-                    })
-                    .collect();
-                let prefixed_schema = Arc::new(Schema::new(prefixed_fields));
-
-                // Create a new batch with the prefixed schema but the same data
-                let options = RecordBatchOptions::default();
-                RecordBatch::try_new_with_options(
-                    prefixed_schema,
-                    batch.columns().to_vec(),
-                    &options,
-                )
-                .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))
-            }
-
-            fn map_column_statistics(
-                &self,
-                stats: &[ColumnStatistics],
-            ) -> Result<Vec<ColumnStatistics>> {
-                // For testing, just return the input statistics
-                Ok(stats.to_vec())
-            }
-        }
-
-        Ok((
-            Arc::new(PrefixSchemaMapping {
-                prefix: self.prefix.clone(),
-            }),
-            projection,
-        ))
-    }
-}
-
-#[test]
-fn test_apply_schema_adapter_with_factory() {
-    // Create a schema
-    let schema = Arc::new(Schema::new(vec![
-        Field::new("id", DataType::Int32, false),
-        Field::new("name", DataType::Utf8, true),
-    ]));
-
-    // Create a parquet source
-    let source = ParquetSource::default();
-
-    // Create a file scan config with source that has a schema adapter factory
-    let factory = Arc::new(PrefixAdapterFactory {
-        prefix: "test_".to_string(),
-    });
-
-    let file_source = source.clone().with_schema_adapter_factory(factory).unwrap();
-
-    let config = FileScanConfigBuilder::new(
-        ObjectStoreUrl::local_filesystem(),
-        schema.clone(),
-        file_source,
-    )
-    .build();
-
-    // Apply schema adapter to a new source
-    let result_source = source.apply_schema_adapter(&config).unwrap();
-
-    // Verify the adapter was applied
-    assert!(result_source.schema_adapter_factory().is_some());
-
-    // Create adapter and test it produces expected schema
-    let adapter_factory = result_source.schema_adapter_factory().unwrap();
-    let adapter = adapter_factory.create(schema.clone(), schema.clone());
-
-    // Create a dummy batch to test the schema mapping
-    let dummy_batch = RecordBatch::new_empty(schema.clone());
-
-    // Get the file schema (which is the same as the table schema in this test)
-    let (mapper, _) = adapter.map_schema(&schema).unwrap();
-
-    // Apply the mapping to get the output schema
-    let mapped_batch = mapper.map_batch(dummy_batch).unwrap();
-    let output_schema = mapped_batch.schema();
-
-    // Check the column names have the prefix
-    assert_eq!(output_schema.field(0).name(), "test_id");
-    assert_eq!(output_schema.field(1).name(), "test_name");
-}
-
-#[test]
-fn test_apply_schema_adapter_without_factory() {
-    // Create a schema
-    let schema = Arc::new(Schema::new(vec![
-        Field::new("id", DataType::Int32, false),
-        Field::new("name", DataType::Utf8, true),
-    ]));
-
-    // Create a parquet source
-    let source = ParquetSource::default();
-
-    // Convert to Arc<dyn FileSource>
-    let file_source: Arc<dyn FileSource> = Arc::new(source.clone());
-
-    // Create a file scan config without a schema adapter factory
-    let config = FileScanConfigBuilder::new(
-        ObjectStoreUrl::local_filesystem(),
-        schema.clone(),
-        file_source,
-    )
-    .build();
-
-    // Apply schema adapter function - should pass through the source unchanged
-    let result_source = source.apply_schema_adapter(&config).unwrap();
-
-    // Verify no adapter was applied
-    assert!(result_source.schema_adapter_factory().is_none());
-}
diff --git a/datafusion/core/tests/parquet/schema_coercion.rs b/datafusion/core/tests/parquet/schema_coercion.rs
index 9be391a9108e6..6f7e2e328d0c3 100644
--- a/datafusion/core/tests/parquet/schema_coercion.rs
+++ b/datafusion/core/tests/parquet/schema_coercion.rs
@@ -18,16 +18,16 @@
 use std::sync::Arc;
 
 use arrow::array::{
-    types::Int32Type, ArrayRef, DictionaryArray, Float32Array, Int64Array, RecordBatch,
-    StringArray,
+    ArrayRef, DictionaryArray, Float32Array, Int64Array, RecordBatch, StringArray,
+    types::Int32Type,
 };
 use arrow::datatypes::{DataType, Field, Schema};
 use datafusion::datasource::physical_plan::ParquetSource;
 use datafusion::physical_plan::collect;
 use datafusion::prelude::SessionContext;
 use datafusion::test::object_store::local_unpartitioned_file;
-use datafusion_common::test_util::batches_to_sort_string;
 use datafusion_common::Result;
+use datafusion_common::test_util::batches_to_sort_string;
 use datafusion_execution::object_store::ObjectStoreUrl;
 use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
@@ -62,14 +62,10 @@ async fn multi_parquet_coercion() {
         Field::new("c2", DataType::Int32, true),
         Field::new("c3", DataType::Float64, true),
     ]));
-    let source = Arc::new(ParquetSource::default());
-    let conf = FileScanConfigBuilder::new(
-        ObjectStoreUrl::local_filesystem(),
-        file_schema,
-        source,
-    )
-    .with_file_group(file_group)
-    .build();
+    let source = Arc::new(ParquetSource::new(file_schema.clone()));
+    let conf = FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source)
+        .with_file_group(file_group)
+        .build();
 
     let parquet_exec = DataSourceExec::from_data_source(conf);
 
@@ -122,11 +118,11 @@ async fn multi_parquet_coercion_projection() {
     ]));
     let config = FileScanConfigBuilder::new(
         ObjectStoreUrl::local_filesystem(),
-        file_schema,
-        Arc::new(ParquetSource::default()),
+        Arc::new(ParquetSource::new(file_schema)),
     )
     .with_file_group(file_group)
     .with_projection_indices(Some(vec![1, 0, 2]))
+    .unwrap()
     .build();
 
     let parquet_exec = DataSourceExec::from_data_source(config);
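The recurring mechanical change in the scan-construction hunks above and below is that the file schema now travels with the source rather than with the scan config builder. A minimal before/after sketch of the pattern, with `url`, `schema`, and `file_group` standing in for the concrete values used in these tests:

    // Before: the builder took the schema, and the source was built with default().
    let source = Arc::new(ParquetSource::default());
    let conf = FileScanConfigBuilder::new(url, schema, source)
        .with_file_group(file_group)
        .build();

    // After: the source owns the schema, so the builder takes only the URL and source.
    let source = Arc::new(ParquetSource::new(schema));
    let conf = FileScanConfigBuilder::new(url, source)
        .with_file_group(file_group)
        .build();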
diff --git a/datafusion/core/tests/parquet/utils.rs b/datafusion/core/tests/parquet/utils.rs
index 24b6cadc148f8..77bc808f1ea08 100644
--- a/datafusion/core/tests/parquet/utils.rs
+++ b/datafusion/core/tests/parquet/utils.rs
@@ -20,7 +20,7 @@
 use datafusion::datasource::physical_plan::ParquetSource;
 use datafusion::datasource::source::DataSourceExec;
 use datafusion_physical_plan::metrics::MetricsSet;
-use datafusion_physical_plan::{accept, ExecutionPlan, ExecutionPlanVisitor};
+use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanVisitor, accept};
 
 /// Find the metrics from the first DataSourceExec encountered in the plan
 #[derive(Debug)]
@@ -47,13 +47,12 @@ impl MetricsFinder {
 impl ExecutionPlanVisitor for MetricsFinder {
     type Error = std::convert::Infallible;
     fn pre_visit(&mut self, plan: &dyn ExecutionPlan) -> Result<bool, Self::Error> {
-        if let Some(data_source_exec) = plan.as_any().downcast_ref::<DataSourceExec>() {
-            if data_source_exec
+        if let Some(data_source_exec) = plan.downcast_ref::<DataSourceExec>()
+            && data_source_exec
                 .downcast_to_file_source::<ParquetSource>()
                 .is_some()
-            {
-                self.metrics = data_source_exec.metrics();
-            }
+        {
+            self.metrics = data_source_exec.metrics();
         }
         // stop searching once we have found the metrics
         Ok(self.metrics.is_none())
diff --git a/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs b/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs
index a79d743cb253d..808e163b08369 100644
--- a/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs
+++ b/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs
@@ -20,26 +20,38 @@
 use std::sync::Arc;
 
 use crate::physical_optimizer::test_utils::TestAggregate;
 use arrow::array::Int32Array;
+use arrow::array::{Int64Array, StringArray};
 use arrow::datatypes::{DataType, Field, Schema};
 use arrow::record_batch::RecordBatch;
+use datafusion::datasource::listing::PartitionedFile;
+use datafusion::datasource::memory::MemTable;
 use datafusion::datasource::memory::MemorySourceConfig;
+use datafusion::datasource::physical_plan::ParquetSource;
 use datafusion::datasource::source::DataSourceExec;
+use datafusion::prelude::{SessionConfig, SessionContext};
+use datafusion_common::assert_batches_eq;
 use datafusion_common::cast::as_int64_array;
 use datafusion_common::config::ConfigOptions;
-use datafusion_common::Result;
+use datafusion_common::stats::Precision;
+use datafusion_common::{ColumnStatistics, Result, Statistics};
+use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
 use datafusion_execution::TaskContext;
+use datafusion_execution::object_store::ObjectStoreUrl;
 use datafusion_expr::Operator;
+use datafusion_functions_aggregate::count::count_udaf;
+use datafusion_physical_expr::aggregate::AggregateExprBuilder;
 use datafusion_physical_expr::expressions::{self, cast};
-use datafusion_physical_optimizer::aggregate_statistics::AggregateStatistics;
 use datafusion_physical_optimizer::PhysicalOptimizerRule;
+use datafusion_physical_optimizer::aggregate_statistics::AggregateStatistics;
+use datafusion_physical_plan::ExecutionPlan;
 use datafusion_physical_plan::aggregates::AggregateExec;
 use datafusion_physical_plan::aggregates::AggregateMode;
 use datafusion_physical_plan::aggregates::PhysicalGroupBy;
 use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;
 use datafusion_physical_plan::common;
+use datafusion_physical_plan::displayable;
 use datafusion_physical_plan::filter::FilterExec;
 use datafusion_physical_plan::projection::ProjectionExec;
-use datafusion_physical_plan::ExecutionPlan;
 
 /// Mock data using a MemorySourceConfig which has an exact count statistic
 fn mock_data() -> Result<Arc<DataSourceExec>> {
@@ -71,7 +83,7 @@ async fn assert_count_optim_success(
     let optimized = AggregateStatistics::new().optimize(Arc::clone(&plan), &config)?;
 
     // A ProjectionExec is a sign that the count optimization was applied
-    assert!(optimized.as_any().is::<ProjectionExec>());
+    assert!(optimized.is::<ProjectionExec>());
 
     // run both the optimized and nonoptimized plan
     let optimized_result =
@@ -268,7 +280,7 @@ async fn test_count_inexact_stat() -> Result<()> {
     let optimized = AggregateStatistics::new().optimize(Arc::new(final_agg), &conf)?;
 
     // check that the original ExecutionPlan was not replaced
-    assert!(optimized.as_any().is::<AggregateExec>());
+    assert!(optimized.is::<AggregateExec>());
 
     Ok(())
 }
@@ -312,7 +324,232 @@ async fn test_count_with_nulls_inexact_stat() -> Result<()> {
     let optimized = AggregateStatistics::new().optimize(Arc::new(final_agg), &conf)?;
 
     // check that the original ExecutionPlan was not replaced
-    assert!(optimized.as_any().is::<AggregateExec>());
+    assert!(optimized.is::<AggregateExec>());
+
+    Ok(())
+}
+
+/// Tests that TopK aggregation correctly handles UTF-8 (string) types in both grouping keys and aggregate values.
+///
+/// The TopK optimization is designed to efficiently handle `GROUP BY ... ORDER BY aggregate LIMIT n` queries
+/// by maintaining only the top K groups during aggregation. However, not all type combinations are supported.
+///
+/// This test verifies two scenarios:
+/// 1. **Supported case**: UTF-8 grouping key with numeric aggregate (max/min) - should use TopK optimization
+/// 2. **Unsupported case**: UTF-8 grouping key with UTF-8 aggregate value - must gracefully fall back to
+///    standard aggregation without panicking
+///
+/// The fallback behavior is critical because attempting to use TopK with unsupported types could cause
+/// runtime panics. This test ensures the optimizer correctly detects incompatible types and chooses
+/// the appropriate execution path.
+#[tokio::test]
+async fn utf8_grouping_min_max_limit_fallbacks() -> Result<()> {
+    let mut config = SessionConfig::new();
+    config.options_mut().optimizer.enable_topk_aggregation = true;
+    let ctx = SessionContext::new_with_config(config);
+
+    let batch = RecordBatch::try_new(
+        Arc::new(Schema::new(vec![
+            Field::new("g", DataType::Utf8, false),
+            Field::new("val_str", DataType::Utf8, false),
+            Field::new("val_num", DataType::Int64, false),
+        ])),
+        vec![
+            Arc::new(StringArray::from(vec!["a", "b", "a"])),
+            Arc::new(StringArray::from(vec!["alpha", "bravo", "charlie"])),
+            Arc::new(Int64Array::from(vec![1, 2, 3])),
+        ],
+    )?;
+    let table = MemTable::try_new(batch.schema(), vec![vec![batch]])?;
+    ctx.register_table("t", Arc::new(table))?;
+
+    // Supported path: numeric min/max with UTF-8 grouping should still use TopK aggregation
+    // and return correct results.
+    let supported_df = ctx
+        .sql("SELECT g, max(val_num) AS m FROM t GROUP BY g ORDER BY m DESC LIMIT 1")
+        .await?;
+    let supported_batches = supported_df.collect().await?;
+    assert_batches_eq!(
+        &[
+            "+---+---+",
+            "| g | m |",
+            "+---+---+",
+            "| a | 3 |",
+            "+---+---+"
+        ],
+        &supported_batches
+    );
+
+    // Unsupported TopK value type: string min/max should fall back without panicking.
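+    // The TopK path is chosen at plan time, so an unsupported Utf8 aggregate value
+    // should keep the regular grouped aggregation stream; the plan inspection below
+    // checks for this directly rather than relying only on the absence of a panic.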
+    let unsupported_df = ctx
+        .sql("SELECT g, max(val_str) AS s FROM t GROUP BY g ORDER BY s DESC LIMIT 1")
+        .await?;
+    let unsupported_plan = unsupported_df.clone().create_physical_plan().await?;
+    let unsupported_batches = unsupported_df.collect().await?;
+
+    // Ensure the plan avoided the TopK-specific stream implementation.
+    let plan_display = displayable(unsupported_plan.as_ref())
+        .indent(true)
+        .to_string();
+    assert!(
+        !plan_display.contains("GroupedTopKAggregateStream"),
+        "Unsupported UTF-8 aggregate value should not use TopK: {plan_display}"
+    );
+
+    assert_batches_eq!(
+        &[
+            "+---+---------+",
+            "| g | s       |",
+            "+---+---------+",
+            "| a | charlie |",
+            "+---+---------+"
+        ],
+        &unsupported_batches
+    );
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_count_distinct_optimization() -> Result<()> {
+    struct TestCase {
+        name: &'static str,
+        distinct_count: Precision<usize>,
+        use_column_expr: bool,
+        expect_optimized: bool,
+        expected_value: Option<i64>,
+    }
+
+    let cases = vec![
+        TestCase {
+            name: "exact statistics",
+            distinct_count: Precision::Exact(42),
+            use_column_expr: true,
+            expect_optimized: true,
+            expected_value: Some(42),
+        },
+        TestCase {
+            name: "absent statistics",
+            distinct_count: Precision::Absent,
+            use_column_expr: true,
+            expect_optimized: false,
+            expected_value: None,
+        },
+        TestCase {
+            name: "inexact statistics",
+            distinct_count: Precision::Inexact(42),
+            use_column_expr: true,
+            expect_optimized: false,
+            expected_value: None,
+        },
+        TestCase {
+            name: "non-column expression with exact statistics",
+            distinct_count: Precision::Exact(42),
+            use_column_expr: false,
+            expect_optimized: false,
+            expected_value: None,
+        },
+    ];
+
+    for case in cases {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, true),
+            Field::new("b", DataType::Int32, true),
+        ]));
+
+        let statistics = Statistics {
+            num_rows: Precision::Exact(100),
+            total_byte_size: Precision::Absent,
+            column_statistics: vec![
+                ColumnStatistics {
+                    distinct_count: case.distinct_count,
+                    null_count: Precision::Exact(10),
+                    ..Default::default()
+                },
+                ColumnStatistics::default(),
+            ],
+        };
+
+        let config = FileScanConfigBuilder::new(
+            ObjectStoreUrl::parse("test:///").unwrap(),
+            Arc::new(ParquetSource::new(Arc::clone(&schema))),
+        )
+        .with_file(PartitionedFile::new("x".to_string(), 100))
+        .with_statistics(statistics)
+        .build();
+
+        let source: Arc<dyn ExecutionPlan> = DataSourceExec::from_data_source(config);
+        let schema = source.schema();
+
+        let (agg_args, alias): (Vec<Arc<dyn PhysicalExpr>>, _) =
+            if case.use_column_expr {
+                (vec![expressions::col("a", &schema)?], "COUNT(DISTINCT a)")
+            } else {
+                (
+                    vec![expressions::binary(
+                        expressions::col("a", &schema)?,
+                        Operator::Plus,
+                        expressions::col("b", &schema)?,
+                        &schema,
+                    )?],
+                    "COUNT(DISTINCT a + b)",
+                )
+            };
+
+        let count_distinct_expr = AggregateExprBuilder::new(count_udaf(), agg_args)
+            .schema(Arc::clone(&schema))
+            .alias(alias)
+            .distinct()
+            .build()?;
+
+        let partial_agg = AggregateExec::try_new(
+            AggregateMode::Partial,
+            PhysicalGroupBy::default(),
+            vec![Arc::new(count_distinct_expr.clone())],
+            vec![None],
+            source,
+            Arc::clone(&schema),
+        )?;
+
+        let final_agg = AggregateExec::try_new(
+            AggregateMode::Final,
+            PhysicalGroupBy::default(),
+            vec![Arc::new(count_distinct_expr)],
+            vec![None],
+            Arc::new(partial_agg),
+            Arc::clone(&schema),
+        )?;
+
+        let conf = ConfigOptions::new();
+        let optimized =
+            AggregateStatistics::new().optimize(Arc::new(final_agg), &conf)?;
+
+        if case.expect_optimized {
+            assert!(
+                optimized.is::<ProjectionExec>(),
+                "'{}': expected ProjectionExec",
+                case.name
+            );
+
+            if let Some(expected_val) = case.expected_value {
+                let task_ctx = Arc::new(TaskContext::default());
+                let result = common::collect(optimized.execute(0, task_ctx)?).await?;
+                assert_eq!(result.len(), 1, "'{}': expected 1 batch", case.name);
+                assert_eq!(
+                    as_int64_array(result[0].column(0)).unwrap().values(),
+                    &[expected_val],
+                    "'{}': unexpected value",
+                    case.name
+                );
+            }
+        } else {
+            assert!(
+                optimized.is::<AggregateExec>(),
+                "'{}': expected AggregateExec (not optimized)",
+                case.name
+            );
+        }
+    }
 
     Ok(())
 }
diff --git a/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs b/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs
index 9c76f6ab6f58b..9e63c341c92d9 100644
--- a/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs
+++ b/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs
@@ -29,18 +29,18 @@
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use datafusion_common::config::ConfigOptions;
 use datafusion_functions_aggregate::count::count_udaf;
 use datafusion_functions_aggregate::sum::sum_udaf;
+use datafusion_physical_expr::Partitioning;
 use datafusion_physical_expr::aggregate::{AggregateExprBuilder, AggregateFunctionExpr};
 use datafusion_physical_expr::expressions::{col, lit};
-use datafusion_physical_expr::Partitioning;
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
-use datafusion_physical_optimizer::combine_partial_final_agg::CombinePartialFinalAggregate;
 use datafusion_physical_optimizer::PhysicalOptimizerRule;
+use datafusion_physical_optimizer::combine_partial_final_agg::CombinePartialFinalAggregate;
+use datafusion_physical_plan::ExecutionPlan;
 use datafusion_physical_plan::aggregates::{
-    AggregateExec, AggregateMode, PhysicalGroupBy,
+    AggregateExec, AggregateMode, LimitOptions, PhysicalGroupBy,
 };
 use datafusion_physical_plan::displayable;
 use datafusion_physical_plan::repartition::RepartitionExec;
-use datafusion_physical_plan::ExecutionPlan;
 
 /// Runs the CombinePartialFinalAggregate optimizer and asserts the plan against the expected
 macro_rules! assert_optimized {
@@ -191,7 +191,7 @@ fn aggregations_combined() -> datafusion_common::Result<()> {
     // should combine the Partial/Final AggregateExecs to the Single AggregateExec
     assert_optimized!(
         plan,
-        @ "
+        @ r"
     AggregateExec: mode=Single, gby=[], aggr=[COUNT(1)]
       DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c], file_type=parquet
     "
@@ -260,7 +260,7 @@ fn aggregations_with_limit_combined() -> datafusion_common::Result<()> {
             schema,
         )
         .unwrap()
-        .with_limit(Some(5)),
+        .with_limit_options(Some(LimitOptions::new(5))),
     );
     let plan: Arc<dyn ExecutionPlan> = final_agg;
     // should combine the Partial/Final AggregateExecs to a Single AggregateExec
diff --git a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs
index db011c4be43ab..78bb02ab1108b 100644
--- a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs
+++ b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs
@@ -26,36 +26,42 @@
 use crate::physical_optimizer::test_utils::{
     sort_preserving_merge_exec, union_exec,
 };
-use arrow::array::{RecordBatch, UInt64Array, UInt8Array};
+use arrow::array::{RecordBatch, UInt8Array, UInt64Array};
 use arrow::compute::SortOptions;
 use arrow_schema::{DataType, Field, Schema, SchemaRef};
 use datafusion::config::ConfigOptions;
+use datafusion::datasource::MemTable;
 use datafusion::datasource::file_format::file_compression_type::FileCompressionType;
 use datafusion::datasource::listing::PartitionedFile;
 use datafusion::datasource::object_store::ObjectStoreUrl;
 use datafusion::datasource::physical_plan::{CsvSource, ParquetSource};
 use datafusion::datasource::source::DataSourceExec;
-use datafusion::datasource::MemTable;
 use datafusion::prelude::{SessionConfig, SessionContext};
-use datafusion_common::error::Result;
-use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
 use datafusion_common::ScalarValue;
+use datafusion_common::config::CsvOptions;
+use datafusion_common::error::Result;
+use datafusion_common::tree_node::{
+    Transformed, TransformedResult, TreeNode, TreeNodeRecursion,
+};
 use datafusion_datasource::file_groups::FileGroup;
 use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
 use datafusion_expr::{JoinType, Operator};
-use datafusion_physical_expr::expressions::{binary, lit, BinaryExpr, Column, Literal};
+use datafusion_functions_aggregate::count::count_udaf;
+use datafusion_physical_expr::aggregate::AggregateExprBuilder;
+use datafusion_physical_expr::expressions::{BinaryExpr, Column, Literal, binary, lit};
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
 use datafusion_physical_expr_common::sort_expr::{
     LexOrdering, OrderingRequirements, PhysicalSortExpr,
 };
+use datafusion_physical_optimizer::PhysicalOptimizerRule;
 use datafusion_physical_optimizer::enforce_distribution::*;
 use datafusion_physical_optimizer::enforce_sorting::EnforceSorting;
 use datafusion_physical_optimizer::output_requirements::OutputRequirements;
-use datafusion_physical_optimizer::PhysicalOptimizerRule;
 use datafusion_physical_plan::aggregates::{
     AggregateExec, AggregateMode, PhysicalGroupBy,
 };
-use datafusion_physical_plan::coalesce_batches::CoalesceBatchesExec;
+
+use datafusion_physical_expr::Distribution;
 use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;
 use datafusion_physical_plan::execution_plan::ExecutionPlan;
 use datafusion_physical_plan::expressions::col;
@@ -66,8 +72,7 @@ use
datafusion_physical_plan::projection::{ProjectionExec, ProjectionExpr}; use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use datafusion_physical_plan::union::UnionExec; use datafusion_physical_plan::{ - displayable, get_plan_string, DisplayAs, DisplayFormatType, ExecutionPlanProperties, - PlanProperties, Statistics, + DisplayAs, DisplayFormatType, ExecutionPlanProperties, PlanProperties, displayable, }; use insta::Settings; @@ -119,7 +124,7 @@ macro_rules! assert_plan { struct SortRequiredExec { input: Arc, expr: LexOrdering, - cache: PlanProperties, + cache: Arc, } impl SortRequiredExec { @@ -131,7 +136,7 @@ impl SortRequiredExec { Self { input, expr: requirement, - cache, + cache: Arc::new(cache), } } @@ -169,11 +174,7 @@ impl ExecutionPlan for SortRequiredExec { "SortRequiredExec" } - fn as_any(&self) -> &dyn std::any::Any { - self - } - - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.cache } @@ -202,6 +203,20 @@ impl ExecutionPlan for SortRequiredExec { ))) } + fn apply_expressions( + &self, + f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result, + ) -> Result { + // Visit expressions in the output ordering from equivalence properties + let mut tnr = TreeNodeRecursion::Continue; + if let Some(ordering) = self.cache.output_ordering() { + for sort_expr in ordering { + tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?; + } + } + Ok(tnr) + } + fn execute( &self, _partition: usize, @@ -209,12 +224,104 @@ impl ExecutionPlan for SortRequiredExec { ) -> Result { unreachable!(); } +} + +#[derive(Debug)] +struct SinglePartitionMaintainsOrderExec { + input: Arc, + cache: Arc, +} + +impl SinglePartitionMaintainsOrderExec { + fn new(input: Arc) -> Self { + let cache = Self::compute_properties(&input); + Self { + input, + cache: Arc::new(cache), + } + } + + fn compute_properties(input: &Arc) -> PlanProperties { + PlanProperties::new( + input.equivalence_properties().clone(), + input.output_partitioning().clone(), + input.pipeline_behavior(), + input.boundedness(), + ) + } +} + +impl DisplayAs for SinglePartitionMaintainsOrderExec { + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "SinglePartitionMaintainsOrderExec") + } + DisplayFormatType::TreeRender => write!(f, ""), + } + } +} + +impl ExecutionPlan for SinglePartitionMaintainsOrderExec { + fn name(&self) -> &'static str { + "SinglePartitionMaintainsOrderExec" + } + + fn properties(&self) -> &Arc { + &self.cache + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn required_input_distribution(&self) -> Vec { + vec![Distribution::SinglePartition] + } + + fn maintains_input_order(&self) -> Vec { + vec![true] + } + + fn benefits_from_input_partitioning(&self) -> Vec { + vec![false] + } + + fn with_new_children( + self: Arc, + mut children: Vec>, + ) -> Result> { + assert_eq!(children.len(), 1); + let child = children.pop().unwrap(); + Ok(Arc::new(Self::new(child))) + } + + fn apply_expressions( + &self, + _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result, + ) -> Result { + Ok(TreeNodeRecursion::Continue) + } - fn statistics(&self) -> Result { - self.input.partition_statistics(None) + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> Result { + unreachable!(); } } +fn single_partition_maintains_order_exec( + input: Arc, +) -> Arc { + Arc::new(SinglePartitionMaintainsOrderExec::new(input)) +} + fn parquet_exec() 
-> Arc { parquet_exec_with_sort(schema(), vec![]) } @@ -229,8 +336,7 @@ fn parquet_exec_multiple_sorted( ) -> Arc { let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), - schema(), - Arc::new(ParquetSource::default()), + Arc::new(ParquetSource::new(schema())), ) .with_file_groups(vec![ FileGroup::new(vec![PartitionedFile::new("x".to_string(), 100)]), @@ -247,14 +353,19 @@ fn csv_exec() -> Arc { } fn csv_exec_with_sort(output_ordering: Vec) -> Arc { - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema(), - Arc::new(CsvSource::new(false, b',', b'"')), - ) - .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_output_ordering(output_ordering) - .build(); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), { + let options = CsvOptions { + has_header: Some(false), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + Arc::new(CsvSource::new(schema()).with_csv_options(options)) + }) + .with_file(PartitionedFile::new("x".to_string(), 100)) + .with_output_ordering(output_ordering) + .build(); DataSourceExec::from_data_source(config) } @@ -265,17 +376,22 @@ fn csv_exec_multiple() -> Arc { // Created a sorted parquet exec with multiple files fn csv_exec_multiple_sorted(output_ordering: Vec) -> Arc { - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema(), - Arc::new(CsvSource::new(false, b',', b'"')), - ) - .with_file_groups(vec![ - FileGroup::new(vec![PartitionedFile::new("x".to_string(), 100)]), - FileGroup::new(vec![PartitionedFile::new("y".to_string(), 100)]), - ]) - .with_output_ordering(output_ordering) - .build(); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), { + let options = CsvOptions { + has_header: Some(false), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + Arc::new(CsvSource::new(schema()).with_csv_options(options)) + }) + .with_file_groups(vec![ + FileGroup::new(vec![PartitionedFile::new("x".to_string(), 100)]), + FileGroup::new(vec![PartitionedFile::new("y".to_string(), 100)]), + ]) + .with_output_ordering(output_ordering) + .build(); DataSourceExec::from_data_source(config) } @@ -340,6 +456,71 @@ fn aggregate_exec_with_alias( ) } +fn partitioned_count_aggregate_exec( + input: Arc, + group_alias_pairs: Vec<(String, String)>, + count_column: &str, +) -> Arc { + let input_schema = input.schema(); + let group_by_expr = group_alias_pairs + .iter() + .map(|(column, alias)| { + ( + col(column, &input_schema).unwrap() as Arc, + alias.clone(), + ) + }) + .collect::>(); + let partial_group_by = PhysicalGroupBy::new_single(group_by_expr.clone()); + let final_group_by = PhysicalGroupBy::new_single( + group_by_expr + .iter() + .enumerate() + .map(|(idx, (_expr, alias))| { + ( + Arc::new(Column::new(alias, idx)) as Arc, + alias.clone(), + ) + }) + .collect::>(), + ); + + let aggr_expr = vec![Arc::new( + AggregateExprBuilder::new( + count_udaf(), + vec![col(count_column, &input_schema).unwrap()], + ) + .schema(Arc::clone(&input_schema)) + .alias(format!("COUNT({count_column})")) + .build() + .unwrap(), + )]; + + let partial = Arc::new( + AggregateExec::try_new( + AggregateMode::Partial, + partial_group_by, + aggr_expr.clone(), + vec![None], + input, + Arc::clone(&input_schema), + ) + .unwrap(), + ); + + Arc::new( + AggregateExec::try_new( + AggregateMode::FinalPartitioned, + final_group_by, + aggr_expr, + vec![None], + Arc::clone(&partial) as _, + partial.schema(), + 
) + .unwrap(), + ) +} + fn hash_join_exec( left: Arc, right: Arc, @@ -469,83 +650,6 @@ impl TestConfig { self } - // This be deleted in https://github.com/apache/datafusion/pull/18185 - /// Perform a series of runs using the current [`TestConfig`], - /// assert the expected plan result, - /// and return the result plan (for potential subsequent runs). - fn run( - &self, - expected_lines: &[&str], - plan: Arc, - optimizers_to_run: &[Run], - ) -> Result> { - let expected_lines: Vec<&str> = expected_lines.to_vec(); - - // Add the ancillary output requirements operator at the start: - let optimizer = OutputRequirements::new_add_mode(); - let mut optimized = optimizer.optimize(plan.clone(), &self.config)?; - - // This file has 2 rules that use tree node, apply these rules to original plan consecutively - // After these operations tree nodes should be in a consistent state. - // This code block makes sure that these rules doesn't violate tree node integrity. - { - let adjusted = if self.config.optimizer.top_down_join_key_reordering { - // Run adjust_input_keys_ordering rule - let plan_requirements = - PlanWithKeyRequirements::new_default(plan.clone()); - let adjusted = plan_requirements - .transform_down(adjust_input_keys_ordering) - .data() - .and_then(check_integrity)?; - // TODO: End state payloads will be checked here. - adjusted.plan - } else { - // Run reorder_join_keys_to_inputs rule - plan.clone() - .transform_up(|plan| { - Ok(Transformed::yes(reorder_join_keys_to_inputs(plan)?)) - }) - .data()? - }; - - // Then run ensure_distribution rule - DistributionContext::new_default(adjusted) - .transform_up(|distribution_context| { - ensure_distribution(distribution_context, &self.config) - }) - .data() - .and_then(check_integrity)?; - // TODO: End state payloads will be checked here. - } - - for run in optimizers_to_run { - optimized = match run { - Run::Distribution => { - let optimizer = EnforceDistribution::new(); - optimizer.optimize(optimized, &self.config)? - } - Run::Sorting => { - let optimizer = EnforceSorting::new(); - optimizer.optimize(optimized, &self.config)? - } - }; - } - - // Remove the ancillary output requirements operator when done: - let optimizer = OutputRequirements::new_remove_mode(); - let optimized = optimizer.optimize(optimized, &self.config)?; - - // Now format correctly - let actual_lines = get_plan_string(&optimized); - - assert_eq!( - &expected_lines, &actual_lines, - "\n\nexpected:\n\n{expected_lines:#?}\nactual:\n\n{actual_lines:#?}\n\n" - ); - - Ok(optimized) - } - /// Perform a series of runs using the current [`TestConfig`], /// assert the expected plan result, /// and return the result plan (for potential subsequent runs). 
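The removed `run()` helper compared plans against hand-maintained vectors of expected lines. The tests below now build each plan with `to_plan` and assert it with inline insta snapshots; a minimal sketch of the replacement pattern, reusing the names from the surrounding call sites (the exact `to_plan` signature is inferred from those call sites):

    // Build the optimized plan for one optimizer run order, then snapshot it.
    let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT);
    assert_plan!(plan_distrib, @r"
    HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, c@2)]
      ...
    ");
    // The other enforcement order can also be asserted to match the first plan.
    let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB);
    assert_plan!(plan_distrib, plan_sort);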
@@ -695,16 +799,13 @@ fn multi_hash_joins() -> Result<()> { assert_plan!(plan_distrib, @r" HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, c@2)] HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, b1@1)] - RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1 DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet "); }, // Should include 4 RepartitionExecs @@ -713,16 +814,13 @@ fn multi_hash_joins() -> Result<()> { HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, c@2)] RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, b1@1)] - RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet "); }, }; @@ -767,16 +865,13 @@ fn multi_hash_joins() -> Result<()> { assert_plan!(plan_distrib, @r" HashJoinExec: mode=Partitioned, join_type=..., on=[(b1@1, c@2)] HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, b1@1)] - RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - RepartitionExec: partitioning=Hash([b1@1], 10), 
input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1 DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet "); } @@ -787,16 +882,13 @@ fn multi_hash_joins() -> Result<()> { HashJoinExec: mode=Partitioned, join_type=..., on=[(b1@6, c@2)] RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10 HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, b1@1)] - RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet "); }, @@ -857,15 +949,12 @@ fn multi_joins_after_alias() -> Result<()> { HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a1@0, c@2)] ProjectionExec: expr=[a@0 as a1, a@0 as a2] HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)] - RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: 
[[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet " ); let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB); @@ -888,15 +977,12 @@ fn multi_joins_after_alias() -> Result<()> { HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a2@1, c@2)] ProjectionExec: expr=[a@0 as a1, a@0 as a2] HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)] - RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet " ); let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB); @@ -946,15 +1032,12 @@ fn multi_joins_after_multi_alias() -> Result<()> { ProjectionExec: expr=[c1@0 as a] ProjectionExec: expr=[c@2 as c1] HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)] - RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet " ); let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB); @@ -1175,23 +1258,19 @@ fn multi_hash_join_key_ordering() -> Result<()> { HashJoinExec: mode=Partitioned, join_type=Inner, 
on=[(B@2, b1@6), (C@3, c@2), (AA@1, a1@5)] ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C] HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b@1, b1@1), (c@2, c1@2), (a@0, a1@0)] - RepartitionExec: partitioning=Hash([b@1, c@2, a@0], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - RepartitionExec: partitioning=Hash([b1@1, c1@2, a1@0], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b@1, b1@1), (c@2, c1@2), (a@0, a1@0)] - RepartitionExec: partitioning=Hash([b@1, c@2, a@0], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + RepartitionExec: partitioning=Hash([b@1, c@2, a@0], 10), input_partitions=1 DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - RepartitionExec: partitioning=Hash([b1@1, c1@2, a1@0], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + RepartitionExec: partitioning=Hash([b1@1, c1@2, a1@0], 10), input_partitions=1 ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - " - ); + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b@1, b1@1), (c@2, c1@2), (a@0, a1@0)] + RepartitionExec: partitioning=Hash([b@1, c@2, a@0], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1, c1@2, a1@0], 10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + " + ); let plan_sort = test_config.to_plan(filter_top_join, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_distrib, plan_sort); @@ -1313,25 +1392,21 @@ fn reorder_join_keys_to_left_input() -> Result<()> { assert_eq!(captured_join_type, join_type.to_string()); insta::allow_duplicates! 
{insta::assert_snapshot!(modified_plan, @r" -HashJoinExec: mode=Partitioned, join_type=..., on=[(AA@1, a1@5), (B@2, b1@6), (C@3, c@2)] - ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C] - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a1@0), (b@1, b1@1), (c@2, c1@2)] - RepartitionExec: partitioning=Hash([a@0, b@1, c@2], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - RepartitionExec: partitioning=Hash([a1@0, b1@1, c1@2], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@2, c1@2), (b@1, b1@1), (a@0, a1@0)] - RepartitionExec: partitioning=Hash([c@2, b@1, a@0], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - RepartitionExec: partitioning=Hash([c1@2, b1@1, a1@0], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -");} + HashJoinExec: mode=Partitioned, join_type=..., on=[(AA@1, a1@5), (B@2, b1@6), (C@3, c@2)] + ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C] + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a1@0), (b@1, b1@1), (c@2, c1@2)] + RepartitionExec: partitioning=Hash([a@0, b@1, c@2], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([a1@0, b1@1, c1@2], 10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@2, c1@2), (b@1, b1@1), (a@0, a1@0)] + RepartitionExec: partitioning=Hash([c@2, b@1, a@0], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c1@2, b1@1, a1@0], 10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + ");} } Ok(()) @@ -1445,25 +1520,21 @@ fn reorder_join_keys_to_right_input() -> Result<()> { let (_, plan_str) = hide_first(reordered.as_ref(), r"join_type=(\w+)", "join_type=..."); insta::allow_duplicates! 
{insta::assert_snapshot!(plan_str, @r" -HashJoinExec: mode=Partitioned, join_type=..., on=[(C@3, c@2), (B@2, b1@6), (AA@1, a1@5)] - ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C] - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a1@0), (b@1, b1@1)] - RepartitionExec: partitioning=Hash([a@0, b@1], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - RepartitionExec: partitioning=Hash([a1@0, b1@1], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@2, c1@2), (b@1, b1@1), (a@0, a1@0)] - RepartitionExec: partitioning=Hash([c@2, b@1, a@0], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - RepartitionExec: partitioning=Hash([c1@2, b1@1, a1@0], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -");} + HashJoinExec: mode=Partitioned, join_type=..., on=[(C@3, c@2), (B@2, b1@6), (AA@1, a1@5)] + ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C] + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a1@0), (b@1, b1@1)] + RepartitionExec: partitioning=Hash([a@0, b@1], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([a1@0, b1@1], 10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@2, c1@2), (b@1, b1@1), (a@0, a1@0)] + RepartitionExec: partitioning=Hash([c@2, b@1, a@0], 10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c1@2, b1@1, a1@0], 10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + ");} } Ok(()) @@ -1503,15 +1574,6 @@ fn multi_smj_joins() -> Result<()> { for join_type in join_types { let join = sort_merge_join_exec(left.clone(), right.clone(), &join_on, &join_type); - let join_plan = |shift| -> String { - format!( - "{}SortMergeJoin: join_type={join_type}, on=[(a@0, b1@1)]", - " ".repeat(shift) - ) - }; - let join_plan_indent2 = join_plan(2); - let join_plan_indent6 = join_plan(6); - let join_plan_indent10 = join_plan(10); // Top join on (a == c) let top_join_on = vec![( @@ -1520,235 +1582,220 @@ fn multi_smj_joins() -> Result<()> { )]; let top_join = sort_merge_join_exec(join.clone(), parquet_exec(), &top_join_on, &join_type); - let top_join_plan = - format!("SortMergeJoin: join_type={join_type}, on=[(a@0, c@2)]"); - - let expected = match join_type { - // Should include 6 RepartitionExecs (3 hash, 3 round-robin), 3 SortExecs - JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti => - vec![ - 
top_join_plan.as_str(), - &join_plan_indent2, - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - // Should include 7 RepartitionExecs (4 hash, 3 round-robin), 4 SortExecs - // Since ordering of the left child is not preserved after SortMergeJoin - // when mode is Right, RightSemi, RightAnti, Full - // - We need to add one additional SortExec after SortMergeJoin in contrast the test cases - // when mode is Inner, Left, LeftSemi, LeftAnti - // Similarly, since partitioning of the left side is not preserved - // when mode is Right, RightSemi, RightAnti, Full - // - We need to add one additional Hash Repartition after SortMergeJoin in contrast the test - // cases when mode is Inner, Left, LeftSemi, LeftAnti - _ => vec![ - top_join_plan.as_str(), - // Below 2 operators are differences introduced, when join mode is changed - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - &join_plan_indent6, - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - }; - // TODO(wiedld): show different test result if enforce sorting first. 
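The two run orders exercised here differ only in when sort enforcement happens. Their definitions are not part of this diff; judging from the `Run::Distribution` / `Run::Sorting` arms of the removed `run()` loop, they plausibly look like the following sketch (assumed, not shown in this patch):

    // Assumed definitions, consistent with how the removed `run()` consumed them.
    const DISTRIB_DISTRIB_SORT: [Run; 3] =
        [Run::Distribution, Run::Distribution, Run::Sorting];
    const SORT_DISTRIB_DISTRIB: [Run; 3] =
        [Run::Sorting, Run::Distribution, Run::Distribution];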
- test_config.run(&expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - - let expected_first_sort_enforcement = match join_type { - // Should include 6 RepartitionExecs (3 hash, 3 round-robin), 3 SortExecs - JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti => - vec![ - top_join_plan.as_str(), - &join_plan_indent2, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=c@2 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - // Should include 8 RepartitionExecs (4 hash, 8 round-robin), 4 SortExecs - // Since ordering of the left child is not preserved after SortMergeJoin - // when mode is Right, RightSemi, RightAnti, Full - // - We need to add one additional SortExec after SortMergeJoin in contrast the test cases - // when mode is Inner, Left, LeftSemi, LeftAnti - // Similarly, since partitioning of the left side is not preserved - // when mode is Right, RightSemi, RightAnti, Full - // - We need to add one additional Hash Repartition and Roundrobin repartition after - // SortMergeJoin in contrast the test cases when mode is Inner, Left, LeftSemi, LeftAnti - _ => vec![ - top_join_plan.as_str(), - // Below 4 operators are differences introduced, when join mode is changed - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - &join_plan_indent10, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=c@2 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[c@2 
ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - }; - // TODO(wiedld): show different test result if enforce distribution first. - test_config.run( - &expected_first_sort_enforcement, - top_join, - &SORT_DISTRIB_DISTRIB, - )?; - match join_type { - JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => { - // This time we use (b1 == c) for top join - // Join on (b1 == c) - let top_join_on = vec![( - Arc::new(Column::new_with_schema("b1", &join.schema()).unwrap()) as _, - Arc::new(Column::new_with_schema("c", &schema()).unwrap()) as _, - )]; - let top_join = - sort_merge_join_exec(join, parquet_exec(), &top_join_on, &join_type); - let top_join_plan = - format!("SortMergeJoin: join_type={join_type}, on=[(b1@6, c@2)]"); - - let expected = match join_type { - // Should include 6 RepartitionExecs(3 hash, 3 round-robin) and 3 SortExecs - JoinType::Inner | JoinType::Right => vec![ - top_join_plan.as_str(), - &join_plan_indent2, - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - // Should include 7 RepartitionExecs (4 hash, 3 round-robin) and 4 SortExecs - JoinType::Left | JoinType::Full => vec![ - top_join_plan.as_str(), - " SortExec: expr=[b1@6 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10", - &join_plan_indent6, - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[true]", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - // this match arm cannot be reached - _ => unreachable!() - }; - // TODO(wiedld): show different test result if enforce sorting first. 
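In the replacement below, `Settings::clone_current()` plus `add_filter` normalizes the rendered join type (`join_type=Inner`, `join_type=Full`, and so on all become `join_type=...`), and `insta::allow_duplicates!` allows the same inline snapshot to be asserted on every iteration of the join-type loop, so a single snapshot replaces the per-arm `expected` vectors deleted here.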
- test_config.run(&expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - - let expected_first_sort_enforcement = match join_type { - // Should include 6 RepartitionExecs (3 of them preserves order) and 3 SortExecs - JoinType::Inner | JoinType::Right => vec![ - top_join_plan.as_str(), - &join_plan_indent2, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=c@2 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - // Should include 8 RepartitionExecs (4 of them preserves order) and 4 SortExecs - JoinType::Left | JoinType::Full => vec![ - top_join_plan.as_str(), - " RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@6 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[b1@6 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - &join_plan_indent10, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=c@2 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], - // this match arm cannot be reached - _ => unreachable!() - }; + let mut settings = Settings::clone_current(); + settings.add_filter(&format!("join_type={join_type}"), "join_type=..."); - // TODO(wiedld): show different test result if enforce distribution first. - test_config.run( - &expected_first_sort_enforcement, - top_join, - &SORT_DISTRIB_DISTRIB, - )?; - } - _ => {} + #[rustfmt::skip] + insta::allow_duplicates! 
{
+ settings.bind(|| {
+ let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT);
+
+ match join_type {
+ // Should include 6 RepartitionExecs (3 hash, 3 round-robin), 3 SortExecs
+ JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti => {
+ assert_plan!(plan_distrib, @r"
+ SortMergeJoinExec: join_type=..., on=[(a@0, c@2)]
+ SortMergeJoinExec: join_type=..., on=[(a@0, b1@1)]
+ SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+ RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]
+ RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1
+ ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ SortExec: expr=[c@2 ASC], preserve_partitioning=[true]
+ RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ ");
+ }
+ // Should include 7 RepartitionExecs (4 hash, 3 round-robin), 4 SortExecs
+ // Since ordering of the left child is not preserved after SortMergeJoinExec
+ // when mode is Right, RightSemi, RightAnti, Full
+ // - We need to add one additional SortExec after SortMergeJoinExec in contrast to the test cases
+ // when mode is Inner, Left, LeftSemi, LeftAnti
+ // Similarly, since partitioning of the left side is not preserved
+ // when mode is Right, RightSemi, RightAnti, Full
+ // - We need to add one additional Hash Repartition after SortMergeJoinExec in contrast to the test
+ // cases when mode is Inner, Left, LeftSemi, LeftAnti
+ _ => {
+ assert_plan!(plan_distrib, @r"
+ SortMergeJoinExec: join_type=..., on=[(a@0, c@2)]
+ SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+ RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10
+ SortMergeJoinExec: join_type=..., on=[(a@0, b1@1)]
+ SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+ RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]
+ RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1
+ ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ SortExec: expr=[c@2 ASC], preserve_partitioning=[true]
+ RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ ");
+ }
+ }
+
+ let plan_sort = test_config.to_plan(top_join.clone(), &SORT_DISTRIB_DISTRIB);
+
+ match join_type {
+ // Should include 6 RepartitionExecs (3 hash, 3 round-robin), 3 SortExecs
+ JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti => {
+ // TODO(wiedld): show different test result if enforce distribution first.
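+ // How the snapshot normalization above works (a reading aid, not part of
+ // the test's assertions): `settings.add_filter` registers a regex rewrite
+ // that insta applies to the rendered plan text before comparing it with
+ // the inline snapshot, so `join_type=Inner`, `join_type=Left`, etc. all
+ // collapse to `join_type=...` and a single snapshot can cover every join
+ // type in this loop, while `insta::allow_duplicates!` permits the same
+ // inline snapshot to be asserted on each iteration. A minimal standalone
+ // sketch of the same pattern, assuming only the `insta` crate:
+ //
+ //     let mut settings = insta::Settings::clone_current();
+ //     settings.add_filter(r"join_type=\w+", "join_type=...");
+ //     settings.bind(|| {
+ //         insta::assert_snapshot!("join_type=Inner", @"join_type=...");
+ //     });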
+ assert_plan!(plan_sort, @r"
+ SortMergeJoinExec: join_type=..., on=[(a@0, c@2)]
+ SortMergeJoinExec: join_type=..., on=[(a@0, b1@1)]
+ RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1, maintains_sort_order=true
+ SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1, maintains_sort_order=true
+ SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]
+ ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1, maintains_sort_order=true
+ SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ ");
+ }
+ // Should include 8 RepartitionExecs (4 hash, 4 round-robin), 4 SortExecs
+ // Since ordering of the left child is not preserved after SortMergeJoinExec
+ // when mode is Right, RightSemi, RightAnti, Full
+ // - We need to add one additional SortExec after SortMergeJoinExec in contrast to the test cases
+ // when mode is Inner, Left, LeftSemi, LeftAnti
+ // Similarly, since partitioning of the left side is not preserved
+ // when mode is Right, RightSemi, RightAnti, Full
+ // - We need to add one additional Hash Repartition and Roundrobin repartition after
+ // SortMergeJoinExec in contrast to the test cases when mode is Inner, Left, LeftSemi, LeftAnti
+ _ => {
+ // TODO(wiedld): show different test result if enforce distribution first.
+ assert_plan!(plan_sort, @r"
+ SortMergeJoinExec: join_type=..., on=[(a@0, c@2)]
+ RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1, maintains_sort_order=true
+ SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+ CoalescePartitionsExec
+ SortMergeJoinExec: join_type=..., on=[(a@0, b1@1)]
+ RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1, maintains_sort_order=true
+ SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1, maintains_sort_order=true
+ SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]
+ ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1, maintains_sort_order=true
+ SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ ");
+ }
+ }
+
+ match join_type {
+ JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => {
+ // This time we use (b1 == c) for top join
+ // Join on (b1 == c)
+ let top_join_on = vec![(
+ Arc::new(Column::new_with_schema("b1", &join.schema()).unwrap()) as _,
+ Arc::new(Column::new_with_schema("c", &schema()).unwrap()) as _,
+ )];
+ let top_join = sort_merge_join_exec(join, parquet_exec(), &top_join_on, &join_type);
+
+ let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT);
+
+ match join_type {
+ // Should include 6 RepartitionExecs (3 hash, 3 round-robin) and 3 SortExecs
+ JoinType::Inner | JoinType::Right => {
+ // TODO(wiedld): show different test result if enforce sorting first.
+ assert_plan!(plan_distrib, @r"
+ SortMergeJoinExec: join_type=..., on=[(b1@6, c@2)]
+ SortMergeJoinExec: join_type=..., on=[(a@0, b1@1)]
+ SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+ RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]
+ RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1
+ ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ SortExec: expr=[c@2 ASC], preserve_partitioning=[true]
+ RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ ");
+ }
+ // Should include 7 RepartitionExecs (4 hash, 3 round-robin) and 4 SortExecs
+ JoinType::Left | JoinType::Full => {
+ // TODO(wiedld): show different test result if enforce sorting first.
+ assert_plan!(plan_distrib, @r"
+ SortMergeJoinExec: join_type=..., on=[(b1@6, c@2)]
+ SortExec: expr=[b1@6 ASC], preserve_partitioning=[true]
+ RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10
+ SortMergeJoinExec: join_type=..., on=[(a@0, b1@1)]
+ SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+ RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]
+ RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1
+ ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ SortExec: expr=[c@2 ASC], preserve_partitioning=[true]
+ RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ ");
+ }
+ // this match arm cannot be reached
+ _ => unreachable!()
+ }
+
+ let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB);
+
+ match join_type {
+ // Should include 6 RepartitionExecs (3 of them preserve order) and 3 SortExecs
+ JoinType::Inner | JoinType::Right => {
+ // TODO(wiedld): show different test result if enforce distribution first.
+ assert_plan!(plan_sort, @r"
+ SortMergeJoinExec: join_type=..., on=[(b1@6, c@2)]
+ SortMergeJoinExec: join_type=..., on=[(a@0, b1@1)]
+ RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1, maintains_sort_order=true
+ SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1, maintains_sort_order=true
+ SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]
+ ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1, maintains_sort_order=true
+ SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ ");
+ }
+ // Should include 8 RepartitionExecs (4 of them preserve order) and 4 SortExecs
+ JoinType::Left | JoinType::Full => {
+ // TODO(wiedld): show different test result if enforce distribution first.
+ assert_plan!(plan_sort, @r"
+ SortMergeJoinExec: join_type=..., on=[(b1@6, c@2)]
+ RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=1, maintains_sort_order=true
+ SortExec: expr=[b1@6 ASC], preserve_partitioning=[false]
+ CoalescePartitionsExec
+ SortMergeJoinExec: join_type=..., on=[(a@0, b1@1)]
+ RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1, maintains_sort_order=true
+ SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1, maintains_sort_order=true
+ SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]
+ ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1, maintains_sort_order=true
+ SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ ");
+ }
+ // this match arm cannot be reached
+ _ => unreachable!()
+ }
+ }
+ _ => {}
+ }
+ });
}
}
-
Ok(())
}
@@ -1806,50 +1853,48 @@ fn smj_join_key_ordering() -> Result<()> {
// Only two RepartitionExecs added
let plan_distrib = test_config.to_plan(join.clone(), &DISTRIB_DISTRIB_SORT);
assert_plan!(plan_distrib, @r"
-SortMergeJoin: join_type=Inner, on=[(b3@1, b2@1), (a3@0, a2@0)]
- SortExec: expr=[b3@1 ASC, a3@0 ASC], preserve_partitioning=[true]
- ProjectionExec: expr=[a1@0 as a3, b1@1 as b3]
- ProjectionExec: expr=[a1@1 as a1, b1@0 as b1]
- AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[]
- RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10
- AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[]
- RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
- DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
- SortExec: expr=[b2@1 ASC, a2@0 ASC], preserve_partitioning=[true]
- ProjectionExec: expr=[a@1 as a2, b@0 as b2]
- AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[]
- RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10
- AggregateExec: mode=Partial, gby=[b@1 as b,
a@0 as a], aggr=[] - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + SortMergeJoinExec: join_type=Inner, on=[(b3@1, b2@1), (a3@0, a2@0)] + SortExec: expr=[b3@1 ASC, a3@0 ASC], preserve_partitioning=[true] + ProjectionExec: expr=[a1@0 as a3, b1@1 as b3] + ProjectionExec: expr=[a1@1 as a1, b1@0 as b1] + AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[] + RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + SortExec: expr=[b2@1 ASC, a2@0 ASC], preserve_partitioning=[true] + ProjectionExec: expr=[a@1 as a2, b@0 as b2] + AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[] + RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); // Test: result IS DIFFERENT, if EnforceSorting is run first: let plan_sort = test_config.to_plan(join, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_sort, @r" -SortMergeJoin: join_type=Inner, on=[(b3@1, b2@1), (a3@0, a2@0)] - RepartitionExec: partitioning=Hash([b3@1, a3@0], 10), input_partitions=10, preserve_order=true, sort_exprs=b3@1 ASC, a3@0 ASC - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - SortExec: expr=[b3@1 ASC, a3@0 ASC], preserve_partitioning=[false] - CoalescePartitionsExec - ProjectionExec: expr=[a1@0 as a3, b1@1 as b3] - ProjectionExec: expr=[a1@1 as a1, b1@0 as b1] - AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[] - RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10 - AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[] + SortMergeJoinExec: join_type=Inner, on=[(b3@1, b2@1), (a3@0, a2@0)] + RepartitionExec: partitioning=Hash([b3@1, a3@0], 10), input_partitions=1, maintains_sort_order=true + SortExec: expr=[b3@1 ASC, a3@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + ProjectionExec: expr=[a1@0 as a3, b1@1 as b3] + ProjectionExec: expr=[a1@1 as a1, b1@0 as b1] + AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[] + RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b2@1, a2@0], 10), input_partitions=1, maintains_sort_order=true + SortExec: expr=[b2@1 ASC, a2@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + ProjectionExec: expr=[a@1 as a2, b@0 as b2] + AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[] + RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[] RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - RepartitionExec: partitioning=Hash([b2@1, 
a2@0], 10), input_partitions=10, preserve_order=true, sort_exprs=b2@1 ASC, a2@0 ASC - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - SortExec: expr=[b2@1 ASC, a2@0 ASC], preserve_partitioning=[false] - CoalescePartitionsExec - ProjectionExec: expr=[a@1 as a2, b@0 as b2] - AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[] - RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10 - AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[] - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + "); Ok(()) } @@ -1867,9 +1912,6 @@ fn merge_does_not_need_sort() -> Result<()> { // Scan some sorted parquet files let exec = parquet_exec_multiple_sorted(vec![sort_key.clone()]); - // CoalesceBatchesExec to mimic behavior after a filter - let exec = Arc::new(CoalesceBatchesExec::new(exec, 4096)); - // Merge from multiple parquet files and keep the data sorted let exec: Arc = Arc::new(SortPreservingMergeExec::new(sort_key, exec)); @@ -1882,10 +1924,9 @@ fn merge_does_not_need_sort() -> Result<()> { let plan_distrib = test_config.to_plan(exec.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_distrib, @r" -SortPreservingMergeExec: [a@0 ASC] - CoalesceBatchesExec: target_batch_size=4096 - DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet -"); + SortPreservingMergeExec: [a@0 ASC] + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + "); // Test: result IS DIFFERENT, if EnforceSorting is run first: // @@ -1896,11 +1937,10 @@ SortPreservingMergeExec: [a@0 ASC] let plan_sort = test_config.to_plan(exec, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_sort, @r" -SortExec: expr=[a@0 ASC], preserve_partitioning=[false] - CoalescePartitionsExec - CoalesceBatchesExec: target_batch_size=4096 - DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet -"); + SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + "); Ok(()) } @@ -2077,11 +2117,11 @@ fn repartition_sorted_limit() -> Result<()> { let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_distrib, @r" -GlobalLimitExec: skip=0, fetch=100 - LocalLimitExec: fetch=100 - SortExec: expr=[c@2 ASC], preserve_partitioning=[false] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + GlobalLimitExec: skip=0, fetch=100 + LocalLimitExec: fetch=100 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); // data is sorted so can't repartition here let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_distrib, plan_sort); @@ -2106,12 +2146,12 @@ fn repartition_sorted_limit_with_filter() -> Result<()> { let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_distrib, @r" -SortRequiredExec: [c@2 ASC] - FilterExec: c@2 = 0 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - SortExec: expr=[c@2 ASC], preserve_partitioning=[false] - DataSourceExec: 
file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + SortRequiredExec: [c@2 ASC] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); // We can use repartition here, ordering requirement by SortRequiredExec // is still satisfied. let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); @@ -2132,19 +2172,19 @@ fn repartition_ignores_limit() -> Result<()> { let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_distrib, @r" -AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] - RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 - AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - GlobalLimitExec: skip=0, fetch=100 - CoalescePartitionsExec - LocalLimitExec: fetch=100 - FilterExec: c@2 = 0 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - GlobalLimitExec: skip=0, fetch=100 - LocalLimitExec: fetch=100 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + GlobalLimitExec: skip=0, fetch=100 + CoalescePartitionsExec + LocalLimitExec: fetch=100 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + GlobalLimitExec: skip=0, fetch=100 + LocalLimitExec: fetch=100 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); // repartition should happen prior to the filter to maximize parallelism // Expect no repartition to happen for local limit (DataSourceExec) @@ -2162,13 +2202,13 @@ fn repartition_ignores_union() -> Result<()> { let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_distrib, @r" -UnionExec - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); // Expect no repartition of DataSourceExec let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_distrib, plan_sort); @@ -2191,9 +2231,9 @@ fn repartition_through_sort_preserving_merge() -> Result<()> { let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); 
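+ // A note on the pass orders compared throughout these tests (inferred from
+ // the constant names and the surrounding comments; the authoritative
+ // definitions live in the test harness): `DISTRIB_DISTRIB_SORT` applies
+ // EnforceDistribution (twice) and then EnforceSorting, while
+ // `SORT_DISTRIB_DISTRIB` applies EnforceSorting first. `to_plan` runs the
+ // given sequence and returns the optimized plan so the two orders can be
+ // compared.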
assert_plan!(plan_distrib, @r"
-SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
- DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
-");
+ SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+ DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+ ");
let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB);
assert_plan!(plan_distrib, plan_sort);
@@ -2219,9 +2259,9 @@ fn repartition_ignores_sort_preserving_merge() -> Result<()> {
// Test: run EnforceDistribution, then EnforceSort
assert_plan!(plan_distrib, @r"
-SortPreservingMergeExec: [c@2 ASC]
- DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
-");
+ SortPreservingMergeExec: [c@2 ASC]
+ DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+ ");
// should not sort (as the data was already sorted)
// should not repartition, since increased parallelism is not beneficial for SortPreservingMerge
let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB);
assert_plan!(plan_sort, @r"
-SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
- CoalescePartitionsExec
- DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
-");
+ SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+ CoalescePartitionsExec
+ DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+ ");
Ok(())
}
@@ -2257,11 +2297,11 @@ fn repartition_ignores_sort_preserving_merge_with_union() -> Result<()> {
// Test: run EnforceDistribution, then EnforceSort.
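+ // Background for the expectation below: SortPreservingMergeExec merges
+ // several already-sorted partitions into one sorted stream, so as long as
+ // each union input stays sorted on c@2, no extra SortExec or
+ // order-destroying repartition should be introduced above it.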
assert_plan!(plan_distrib, @r" -SortPreservingMergeExec: [c@2 ASC] - UnionExec - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet -"); + SortPreservingMergeExec: [c@2 ASC] + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); // // should not repartition / sort (as the data was already sorted) @@ -2269,12 +2309,12 @@ SortPreservingMergeExec: [c@2 ASC] let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_sort, @r" -SortExec: expr=[c@2 ASC], preserve_partitioning=[false] - CoalescePartitionsExec - UnionExec - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet -"); + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); Ok(()) } @@ -2301,11 +2341,11 @@ fn repartition_does_not_destroy_sort() -> Result<()> { let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_distrib, @r" -SortRequiredExec: [d@3 ASC] - FilterExec: c@2 = 0 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[d@3 ASC], file_type=parquet -"); + SortRequiredExec: [d@3 ASC] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[d@3 ASC], file_type=parquet + "); // during repartitioning ordering is preserved let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_distrib, plan_sort); @@ -2341,13 +2381,13 @@ fn repartition_does_not_destroy_sort_more_complex() -> Result<()> { let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_distrib, @r" -UnionExec - SortRequiredExec: [c@2 ASC] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet - FilterExec: c@2 = 0 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + UnionExec + SortRequiredExec: [c@2 ASC] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); // union input 1: no repartitioning // union input 2: should repartition // @@ -2384,23 +2424,23 @@ fn repartition_transitively_with_projection() -> Result<()> { let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); 
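+ // `assert_plan!` appears in two forms in this file (a reading aid,
+ // inferred from usage): `assert_plan!(plan, @r"...")` checks one plan
+ // against an inline insta snapshot, while `assert_plan!(plan_a, plan_b)`
+ // asserts that two independently optimized plans render identically, e.g.:
+ //
+ //     let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB);
+ //     assert_plan!(plan_distrib, plan_sort);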
assert_plan!(plan_distrib, @r" -SortPreservingMergeExec: [sum@0 ASC] - SortExec: expr=[sum@0 ASC], preserve_partitioning=[true] - ProjectionExec: expr=[a@0 + b@1 as sum] - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + SortPreservingMergeExec: [sum@0 ASC] + SortExec: expr=[sum@0 ASC], preserve_partitioning=[true] + ProjectionExec: expr=[a@0 + b@1 as sum] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); // Test: result IS DIFFERENT, if EnforceSorting is run first: let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_sort, @r" -SortExec: expr=[sum@0 ASC], preserve_partitioning=[false] - CoalescePartitionsExec - ProjectionExec: expr=[a@0 + b@1 as sum] - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + SortExec: expr=[sum@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + ProjectionExec: expr=[a@0 + b@1 as sum] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); // Since this projection is not trivial, increasing parallelism is beneficial Ok(()) @@ -2432,10 +2472,10 @@ fn repartition_ignores_transitively_with_projection() -> Result<()> { let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_distrib, @r" -SortRequiredExec: [c@2 ASC] - ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c] - DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet -"); + SortRequiredExec: [c@2 ASC] + ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c] + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); // Since this projection is trivial, increasing parallelism is not beneficial let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); @@ -2469,10 +2509,10 @@ fn repartition_transitively_past_sort_with_projection() -> Result<()> { let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_distrib, @r" -SortExec: expr=[c@2 ASC], preserve_partitioning=[false] - ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); // Since this projection is trivial, increasing parallelism is not beneficial let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_distrib, plan_sort); @@ -2494,12 +2534,12 @@ fn repartition_transitively_past_sort_with_filter() -> Result<()> { let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_distrib, @r" -SortPreservingMergeExec: [a@0 ASC] - SortExec: expr=[a@0 ASC], preserve_partitioning=[true] - FilterExec: c@2 = 0 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], 
file_type=parquet -"); + SortPreservingMergeExec: [a@0 ASC] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); // Expect repartition on the input to the sort (as it can benefit from additional parallelism) @@ -2507,12 +2547,12 @@ SortPreservingMergeExec: [a@0 ASC] let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_sort, @r" -SortExec: expr=[a@0 ASC], preserve_partitioning=[false] - CoalescePartitionsExec - FilterExec: c@2 = 0 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); // Expect repartition on the input of the filter (as it can benefit from additional parallelism) Ok(()) @@ -2543,13 +2583,13 @@ fn repartition_transitively_past_sort_with_projection_and_filter() -> Result<()> let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_distrib, @r" -SortPreservingMergeExec: [a@0 ASC] - SortExec: expr=[a@0 ASC], preserve_partitioning=[true] - ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c] - FilterExec: c@2 = 0 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + SortPreservingMergeExec: [a@0 ASC] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); // Expect repartition on the input to the sort (as it can benefit from additional parallelism) // repartition is lowest down @@ -2558,13 +2598,13 @@ SortPreservingMergeExec: [a@0 ASC] let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_sort, @r" -SortExec: expr=[a@0 ASC], preserve_partitioning=[false] - CoalescePartitionsExec - ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c] - FilterExec: c@2 = 0 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); Ok(()) } @@ -2584,11 +2624,11 @@ fn parallelization_single_partition() -> Result<()> { test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_parquet_distrib, @r" -AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] - RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] - DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=parquet -"); + 
AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=parquet + "); let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_parquet_distrib, plan_parquet_sort); @@ -2596,11 +2636,11 @@ AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_csv_distrib, @r" -AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] - RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] - DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false -"); + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + "); let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_csv_distrib, plan_csv_sort); @@ -2632,10 +2672,10 @@ fn parallelization_multiple_files() -> Result<()> { test_config_concurrency_3.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_3_distrib, @r" -SortRequiredExec: [a@0 ASC] - FilterExec: c@2 = 0 - DataSourceExec: file_groups={3 groups: [[x:0..50], [y:0..100], [x:50..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet -"); + SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + DataSourceExec: file_groups={3 groups: [[x:0..50], [y:0..100], [x:50..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + "); let plan_3_sort = test_config_concurrency_3.to_plan(plan.clone(), &SORT_DISTRIB_DISTRIB); assert_plan!(plan_3_distrib, plan_3_sort); @@ -2645,10 +2685,10 @@ SortRequiredExec: [a@0 ASC] test_config_concurrency_8.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_8_distrib, @r" -SortRequiredExec: [a@0 ASC] - FilterExec: c@2 = 0 - DataSourceExec: file_groups={8 groups: [[x:0..25], [y:0..25], [x:25..50], [y:25..50], [x:50..75], [y:50..75], [x:75..100], [y:75..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet -"); + SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + DataSourceExec: file_groups={8 groups: [[x:0..25], [y:0..25], [x:25..50], [y:25..50], [x:50..75], [y:50..75], [x:75..100], [y:75..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + "); let plan_8_sort = test_config_concurrency_8.to_plan(plan, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_8_distrib, plan_8_sort); @@ -2667,46 +2707,55 @@ fn parallelization_compressed_csv() -> Result<()> { FileCompressionType::UNCOMPRESSED, ]; - let expected_not_partitioned = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - - let expected_partitioned = [ - "AggregateExec: 
mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; + #[rustfmt::skip] + insta::allow_duplicates! { + for compression_type in compression_types { + let plan = aggregate_exec_with_alias( + DataSourceExec::from_data_source( + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), { + let options = CsvOptions { + has_header: Some(false), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + Arc::new(CsvSource::new(schema()).with_csv_options(options)) + }) + .with_file(PartitionedFile::new("x".to_string(), 100)) + .with_file_compression_type(compression_type) + .build(), + ), + vec![("a".to_string(), "a".to_string())], + ); + let test_config = TestConfig::default() + .with_query_execution_partitions(2) + .with_prefer_repartition_file_scans(10); + + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + if compression_type.is_compressed() { + // Compressed files cannot be partitioned + assert_plan!(plan_distrib, + @r" + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + "); + } else { + // Uncompressed files can be partitioned + assert_plan!(plan_distrib, + @r" + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + "); + } - for compression_type in compression_types { - let expected = if compression_type.is_compressed() { - &expected_not_partitioned[..] - } else { - &expected_partitioned[..] 
- }; - - let plan = aggregate_exec_with_alias( - DataSourceExec::from_data_source( - FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema(), - Arc::new(CsvSource::new(false, b',', b'"')), - ) - .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_file_compression_type(compression_type) - .build(), - ), - vec![("a".to_string(), "a".to_string())], - ); - let test_config = TestConfig::default() - .with_query_execution_partitions(2) - .with_prefer_repartition_file_scans(10); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); + } } Ok(()) } @@ -2726,23 +2775,23 @@ fn parallelization_two_partitions() -> Result<()> { test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_parquet_distrib, @r" -AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] - RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] - DataSourceExec: file_groups={2 groups: [[x:0..100], [y:0..100]]}, projection=[a, b, c, d, e], file_type=parquet -"); - // Plan already has two partitions - let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={2 groups: [[x:0..100], [y:0..100]]}, projection=[a, b, c, d, e], file_type=parquet + "); + // Plan already has two partitions + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_csv_distrib, @r" -AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] - RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] - DataSourceExec: file_groups={2 groups: [[x:0..100], [y:0..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false -"); + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={2 groups: [[x:0..100], [y:0..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + "); // Plan already has two partitions let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_csv_distrib, plan_csv_sort); @@ -2766,11 +2815,11 @@ fn parallelization_two_partitions_into_four() -> Result<()> { // Multiple source files split across partitions assert_plan!(plan_parquet_distrib, @r" -AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] - RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 - AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] - DataSourceExec: file_groups={4 groups: [[x:0..50], [x:50..100], [y:0..50], [y:50..100]]}, projection=[a, b, c, d, e], file_type=parquet -"); + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={4 groups: [[x:0..50], [x:50..100], [y:0..50], 
[y:50..100]]}, projection=[a, b, c, d, e], file_type=parquet + "); // Multiple source files split across partitions let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_parquet_distrib, plan_parquet_sort); @@ -2779,11 +2828,11 @@ AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); // Multiple source files split across partitions assert_plan!(plan_csv_distrib, @r" -AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] - RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 - AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] - DataSourceExec: file_groups={4 groups: [[x:0..50], [x:50..100], [y:0..50], [y:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false -"); + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={4 groups: [[x:0..50], [x:50..100], [y:0..50], [y:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + "); // Multiple source files split across partitions let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_csv_distrib, plan_csv_sort); @@ -2808,11 +2857,11 @@ fn parallelization_sorted_limit() -> Result<()> { let plan_parquet_distrib = test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_parquet_distrib, @r" -GlobalLimitExec: skip=0, fetch=100 - LocalLimitExec: fetch=100 - SortExec: expr=[c@2 ASC], preserve_partitioning=[false] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + GlobalLimitExec: skip=0, fetch=100 + LocalLimitExec: fetch=100 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); // data is sorted so can't repartition here // Doesn't parallelize for SortExec without preserve_partitioning let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); @@ -2822,11 +2871,11 @@ GlobalLimitExec: skip=0, fetch=100 let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_csv_distrib, @r" -GlobalLimitExec: skip=0, fetch=100 - LocalLimitExec: fetch=100 - SortExec: expr=[c@2 ASC], preserve_partitioning=[false] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false -"); + GlobalLimitExec: skip=0, fetch=100 + LocalLimitExec: fetch=100 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + "); // data is sorted so can't repartition here // Doesn't parallelize for SortExec without preserve_partitioning let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); @@ -2857,14 +2906,14 @@ fn parallelization_limit_with_filter() -> Result<()> { // SortExec doesn't benefit from input partitioning assert_plan!(plan_parquet_distrib, @r" -GlobalLimitExec: skip=0, fetch=100 - CoalescePartitionsExec - LocalLimitExec: fetch=100 - FilterExec: c@2 = 0 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - SortExec: expr=[c@2 ASC], preserve_partitioning=[false] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + 
GlobalLimitExec: skip=0, fetch=100 + CoalescePartitionsExec + LocalLimitExec: fetch=100 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_parquet_distrib, plan_parquet_sort); @@ -2875,14 +2924,14 @@ GlobalLimitExec: skip=0, fetch=100 // SortExec doesn't benefit from input partitioning assert_plan!(plan_csv_distrib, @r" -GlobalLimitExec: skip=0, fetch=100 - CoalescePartitionsExec - LocalLimitExec: fetch=100 - FilterExec: c@2 = 0 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - SortExec: expr=[c@2 ASC], preserve_partitioning=[false] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false -"); + GlobalLimitExec: skip=0, fetch=100 + CoalescePartitionsExec + LocalLimitExec: fetch=100 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + "); let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_csv_distrib, plan_csv_sort); @@ -2961,13 +3010,13 @@ fn parallelization_union_inputs() -> Result<()> { test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_parquet_distrib, @r" -UnionExec - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); // Union doesn't benefit from input partitioning - no parallelism let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_parquet_distrib, plan_parquet_sort); @@ -2976,13 +3025,13 @@ UnionExec let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_csv_distrib, @r" -UnionExec - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, 
has_header=false -"); + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + "); // Union doesn't benefit from input partitioning - no parallelism let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_csv_distrib, plan_csv_sort); @@ -3188,9 +3237,9 @@ fn parallelization_ignores_transitively_with_projection_parquet() -> Result<()> // data should not be repartitioned / resorted assert_plan!(plan_parquet_distrib, @r" -ProjectionExec: expr=[a@0 as a2, c@2 as c2] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet -"); + ProjectionExec: expr=[a@0 as a2, c@2 as c2] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_parquet_distrib, plan_parquet_sort); @@ -3223,18 +3272,18 @@ fn parallelization_ignores_transitively_with_projection_csv() -> Result<()> { let plan_csv = sort_preserving_merge_exec(sort_key_after_projection, proj_csv); assert_plan!(plan_csv, @r" -SortPreservingMergeExec: [c2@1 ASC] - ProjectionExec: expr=[a@0 as a2, c@2 as c2] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false -"); + SortPreservingMergeExec: [c2@1 ASC] + ProjectionExec: expr=[a@0 as a2, c@2 as c2] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false + "); let test_config = TestConfig::default(); let plan_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_distrib, @r" -ProjectionExec: expr=[a@0 as a2, c@2 as c2] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false -"); + ProjectionExec: expr=[a@0 as a2, c@2 as c2] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false + "); // Expected Outcome: // data should not be repartitioned / resorted let plan_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); @@ -3250,21 +3299,21 @@ fn remove_redundant_roundrobins() -> Result<()> { let physical_plan = repartition_exec(filter_exec(repartition)); assert_plan!(physical_plan, @r" -RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10 - FilterExec: c@2 = 0 RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); let test_config = 
TestConfig::default(); let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_distrib, @r" -FilterExec: c@2 = 0 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_distrib, plan_sort); @@ -3292,11 +3341,11 @@ fn remove_unnecessary_spm_after_filter() -> Result<()> { // This is still satisfied since, after filter that column is constant. assert_plan!(plan_distrib, @r" -CoalescePartitionsExec - FilterExec: c@2 = 0 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=c@2 ASC - DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet -"); + CoalescePartitionsExec + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=c@2 ASC + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_distrib, plan_sort); @@ -3321,11 +3370,76 @@ fn preserve_ordering_through_repartition() -> Result<()> { let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_distrib, @r" -SortPreservingMergeExec: [d@3 ASC] - FilterExec: c@2 = 0 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=d@3 ASC - DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[d@3 ASC], file_type=parquet -"); + SortPreservingMergeExec: [d@3 ASC] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=d@3 ASC + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[d@3 ASC], file_type=parquet + "); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); + + Ok(()) +} + +#[test] +fn preserve_ordering_for_streaming_sorted_aggregate() -> Result<()> { + let schema = schema(); + let sort_key: LexOrdering = [PhysicalSortExpr { + expr: col("a", &schema)?, + options: SortOptions::default(), + }] + .into(); + let input = parquet_exec_multiple_sorted(vec![sort_key]); + let physical_plan = partitioned_count_aggregate_exec( + input, + vec![("a".to_string(), "a".to_string())], + "b", + ); + + let test_config = TestConfig::default().with_query_execution_partitions(2); + + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, @r" + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[COUNT(b)], ordering_mode=Sorted + RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2, preserve_order=true, sort_exprs=a@0 ASC + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[COUNT(b)], ordering_mode=Sorted + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + "); + + let plan_sort = 
test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); + + Ok(()) +} + +#[test] +fn preserve_ordering_for_streaming_partially_sorted_aggregate() -> Result<()> { + let schema = schema(); + let sort_key: LexOrdering = [PhysicalSortExpr { + expr: col("a", &schema)?, + options: SortOptions::default(), + }] + .into(); + let input = parquet_exec_multiple_sorted(vec![sort_key]); + let physical_plan = partitioned_count_aggregate_exec( + input, + vec![ + ("a".to_string(), "a".to_string()), + ("b".to_string(), "b".to_string()), + ], + "c", + ); + + let test_config = TestConfig::default().with_query_execution_partitions(2); + + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, @r" + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b], aggr=[COUNT(c)], ordering_mode=PartiallySorted([0]) + RepartitionExec: partitioning=Hash([a@0, b@1], 2), input_partitions=2, preserve_order=true, sort_exprs=a@0 ASC + AggregateExec: mode=Partial, gby=[a@0 as a, b@1 as b], aggr=[COUNT(c)], ordering_mode=PartiallySorted([0]) + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + "); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_distrib, plan_sort); @@ -3349,23 +3463,23 @@ fn do_not_preserve_ordering_through_repartition() -> Result<()> { // Test: run EnforceDistribution, then EnforceSort. assert_plan!(plan_distrib, @r" -SortPreservingMergeExec: [a@0 ASC] - SortExec: expr=[a@0 ASC], preserve_partitioning=[true] - FilterExec: c@2 = 0 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 - DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet -"); + SortPreservingMergeExec: [a@0 ASC] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + "); // Test: result IS DIFFERENT, if EnforceSorting is run first: let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_sort, @r" -SortExec: expr=[a@0 ASC], preserve_partitioning=[false] - CoalescePartitionsExec - FilterExec: c@2 = 0 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 - DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet -"); + SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + "); Ok(()) } @@ -3384,11 +3498,11 @@ fn no_need_for_sort_after_filter() -> Result<()> { let test_config = TestConfig::default(); let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_distrib, @r" -CoalescePartitionsExec - FilterExec: c@2 = 0 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 - DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet -"); + CoalescePartitionsExec + FilterExec: c@2 = 0 + RepartitionExec: 
partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_distrib, plan_sort); // After CoalescePartitionsExec c is still constant. Hence c@2 ASC ordering is already satisfied. @@ -3420,24 +3534,24 @@ fn do_not_preserve_ordering_through_repartition2() -> Result<()> { // Test: run EnforceDistribution, then EnforceSort. assert_plan!(plan_distrib, @r" -SortPreservingMergeExec: [a@0 ASC] - SortExec: expr=[a@0 ASC], preserve_partitioning=[true] - FilterExec: c@2 = 0 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 - DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet -"); + SortPreservingMergeExec: [a@0 ASC] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); // Test: result IS DIFFERENT, if EnforceSorting is run first: let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_sort, @r" -SortExec: expr=[a@0 ASC], preserve_partitioning=[false] - CoalescePartitionsExec - SortExec: expr=[a@0 ASC], preserve_partitioning=[true] - FilterExec: c@2 = 0 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 - DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet -"); + SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); Ok(()) } @@ -3457,10 +3571,10 @@ fn do_not_preserve_ordering_through_repartition3() -> Result<()> { let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_distrib, @r" -FilterExec: c@2 = 0 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 - DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet -"); + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_distrib, plan_sort); @@ -3480,10 +3594,10 @@ fn do_not_put_sort_when_input_is_invalid() -> Result<()> { // Ordering requirement of sort required exec is NOT satisfied // by existing ordering at the source. 
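// (SortRequiredExec is, as far as this test file shows, a test-only operator
// that declares a hard ordering requirement on its input, so the assertions
// here can observe whether the rule satisfies or ignores that requirement.)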
assert_plan!(physical_plan, @r" -SortRequiredExec: [a@0 ASC] - FilterExec: c@2 = 0 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); let mut config = ConfigOptions::new(); config.execution.target_partitions = 10; @@ -3493,11 +3607,11 @@ SortRequiredExec: [a@0 ASC] // Since at the start of the rule the ordering requirement is not satisfied, // the EnforceDistribution rule doesn't satisfy this requirement either. assert_plan!(dist_plan, @r" -SortRequiredExec: [a@0 ASC] - FilterExec: c@2 = 0 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); Ok(()) } @@ -3516,10 +3630,10 @@ fn put_sort_when_input_is_valid() -> Result<()> { // Ordering requirement of sort required exec is satisfied // by existing ordering at the source. assert_plan!(physical_plan, @r" -SortRequiredExec: [a@0 ASC] - FilterExec: c@2 = 0 - DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet -"); + SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + "); let mut config = ConfigOptions::new(); config.execution.target_partitions = 10; @@ -3529,10 +3643,10 @@ SortRequiredExec: [a@0 ASC] // Since at the start of the rule the ordering requirement is satisfied, // the EnforceDistribution rule satisfies this requirement as well.
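// For reference, the `dist_plan` asserted next is produced by invoking the
// rule directly as a `PhysicalOptimizerRule`; a hedged sketch of that
// invocation (the exact elided code between the hunks may differ):
//
//     use datafusion_physical_optimizer::enforce_distribution::EnforceDistribution;
//     use datafusion_physical_optimizer::PhysicalOptimizerRule;
//
//     let mut config = ConfigOptions::new();
//     config.execution.target_partitions = 10;
//     let dist_plan = EnforceDistribution::new().optimize(physical_plan, &config)?;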
assert_plan!(dist_plan, @r" -SortRequiredExec: [a@0 ASC] - FilterExec: c@2 = 0 - DataSourceExec: file_groups={10 groups: [[x:0..20], [y:0..20], [x:20..40], [y:20..40], [x:40..60], [y:40..60], [x:60..80], [y:60..80], [x:80..100], [y:80..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet -"); + SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + DataSourceExec: file_groups={10 groups: [[x:0..20], [y:0..20], [x:20..40], [y:20..40], [x:40..60], [y:40..60], [x:60..80], [y:60..80], [x:80..100], [y:80..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + "); Ok(()) } @@ -3556,10 +3670,10 @@ fn do_not_add_unnecessary_hash() -> Result<()> { let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_distrib, @r" -AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] - AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet -"); + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_distrib, plan_sort); @@ -3586,14 +3700,14 @@ fn do_not_add_unnecessary_hash2() -> Result<()> { let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_distrib, @r" -AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] - AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] - RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 - AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 - DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet -"); + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); // Since the hash requirements of this operator are satisfied, there shouldn't be
// a hash repartition here let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); @@ -3607,17 +3721,15 @@ fn optimize_away_unnecessary_repartition() -> Result<()> { let physical_plan = coalesce_partitions_exec(repartition_exec(parquet_exec())); assert_plan!(physical_plan, @r" -CoalescePartitionsExec - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + CoalescePartitionsExec + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); let test_config = TestConfig::default(); let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_distrib, - @r" -DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet"); let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_distrib, plan_sort); @@ -3631,23 +3743,23 @@ fn optimize_away_unnecessary_repartition2() -> Result<()> { ))); assert_plan!(physical_plan, @r" -FilterExec: c@2 = 0 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - CoalescePartitionsExec - FilterExec: c@2 = 0 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + CoalescePartitionsExec + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); let test_config = TestConfig::default(); let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); assert_plan!(plan_distrib, @r" -FilterExec: c@2 = 0 - FilterExec: c@2 = 0 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + FilterExec: c@2 = 0 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); assert_plan!(plan_distrib, plan_sort); @@ -3671,29 +3783,29 @@ async fn test_distribute_sort_parquet() -> Result<()> { // prior to optimization, this is the starting plan assert_plan!(physical_plan, @r" -SortExec: expr=[c@2 ASC], preserve_partitioning=[false] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet -"); + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); // what the enforce distribution run does.
let plan_distribution = test_config.to_plan(physical_plan.clone(), &[Run::Distribution]); assert_plan!(plan_distribution, @r" -SortExec: expr=[c@2 ASC], preserve_partitioning=[false] - CoalescePartitionsExec - DataSourceExec: file_groups={10 groups: [[x:0..8192000], [x:8192000..16384000], [x:16384000..24576000], [x:24576000..32768000], [x:32768000..40960000], [x:40960000..49152000], [x:49152000..57344000], [x:57344000..65536000], [x:65536000..73728000], [x:73728000..81920000]]}, projection=[a, b, c, d, e], file_type=parquet -"); + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + DataSourceExec: file_groups={10 groups: [[x:0..8192000], [x:8192000..16384000], [x:16384000..24576000], [x:24576000..32768000], [x:32768000..40960000], [x:40960000..49152000], [x:49152000..57344000], [x:57344000..65536000], [x:65536000..73728000], [x:73728000..81920000]]}, projection=[a, b, c, d, e], file_type=parquet + "); // what the sort parallelization (in enforce sorting), does after the enforce distribution changes let plan_both = test_config.to_plan(physical_plan, &[Run::Distribution, Run::Sorting]); assert_plan!(plan_both, @r" -SortPreservingMergeExec: [c@2 ASC] - SortExec: expr=[c@2 ASC], preserve_partitioning=[true] - DataSourceExec: file_groups={10 groups: [[x:0..8192000], [x:8192000..16384000], [x:16384000..24576000], [x:24576000..32768000], [x:32768000..40960000], [x:40960000..49152000], [x:49152000..57344000], [x:57344000..65536000], [x:65536000..73728000], [x:73728000..81920000]]}, projection=[a, b, c, d, e], file_type=parquet -"); + SortPreservingMergeExec: [c@2 ASC] + SortExec: expr=[c@2 ASC], preserve_partitioning=[true] + DataSourceExec: file_groups={10 groups: [[x:0..8192000], [x:8192000..16384000], [x:16384000..24576000], [x:24576000..32768000], [x:32768000..40960000], [x:40960000..49152000], [x:49152000..57344000], [x:57344000..65536000], [x:65536000..73728000], [x:73728000..81920000]]}, projection=[a, b, c, d, e], file_type=parquet + "); Ok(()) } @@ -3720,10 +3832,10 @@ async fn test_distribute_sort_memtable() -> Result<()> { // this is the final, optimized plan assert_plan!(physical_plan, @r" -SortPreservingMergeExec: [id@0 ASC NULLS LAST] - SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true] - DataSourceExec: partitions=3, partition_sizes=[34, 33, 33] -"); + SortPreservingMergeExec: [id@0 ASC NULLS LAST] + SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true] + DataSourceExec: partitions=3, partition_sizes=[34, 33, 33] + "); Ok(()) } @@ -3781,7 +3893,6 @@ fn test_replace_order_preserving_variants_with_fetch() -> Result<()> { // Verify the plan was transformed to CoalescePartitionsExec result .plan - .as_any() .downcast_ref::() .expect("Expected CoalescePartitionsExec"); @@ -3794,3 +3905,106 @@ fn test_replace_order_preserving_variants_with_fetch() -> Result<()> { Ok(()) } + +/// When a parent requires SinglePartition and maintains input order, order-preserving +/// variants (e.g. SortPreservingMergeExec) should be kept so that ordering can +/// propagate to ancestors. Replacing them with CoalescePartitionsExec would destroy +/// ordering and force unnecessary sorts later. 
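To make the distinction above concrete: both operators collapse a multi-partition input into a single partition, but only the merge variant keeps rows sorted. A minimal sketch of that choice using the real DataFusion operators (`input` and `ordering` are stand-ins, not code from this PR):

use std::sync::Arc;
use datafusion_physical_expr::LexOrdering;
use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;
use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
use datafusion_physical_plan::ExecutionPlan;

/// Collapse a multi-partition plan into a single partition, keeping the
/// per-partition sort order only when an ordering is supplied.
fn to_single_partition(
    input: Arc<dyn ExecutionPlan>,
    ordering: Option<LexOrdering>,
) -> Arc<dyn ExecutionPlan> {
    match ordering {
        // Merges already-sorted partitions, so the single output stays sorted.
        Some(ordering) => Arc::new(SortPreservingMergeExec::new(ordering, input)),
        // Cheaper, but concatenates batches in whatever order partitions finish.
        None => Arc::new(CoalescePartitionsExec::new(input)),
    }
}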
+#[test] +fn maintains_order_preserves_spm_for_single_partition() -> Result<()> { + let schema = schema(); + let sort_key: LexOrdering = [PhysicalSortExpr { + expr: col("c", &schema)?, + options: SortOptions::default(), + }] + .into(); + + // GlobalLimitExec -> LocalLimitExec -> sorted multi-partition parquet + let plan: Arc = + limit_exec(parquet_exec_multiple_sorted(vec![sort_key.clone()])); + + // Test EnforceDistribution in isolation: SPM should be preserved because + // GlobalLimitExec maintains input order. + let result = ensure_distribution_helper(plan, 10, false)?; + assert_plan!(result, + @r" + GlobalLimitExec: skip=0, fetch=100 + SortPreservingMergeExec: [c@2 ASC] + LocalLimitExec: fetch=100 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); + + Ok(()) +} + +/// Tests the cascading effect through a UnionExec with the full optimizer +/// pipeline and `prefer_existing_sort=true`. Each Union branch has an operator +/// that requires SinglePartition and maintains input order. SortPreservingMergeExec +/// should be preserved in each branch, allowing ordering to flow through to the +/// ancestor SortRequiredExec. +#[test] +fn maintains_order_preserves_spm_through_union_with_prefer_existing_sort() -> Result<()> { + let schema = schema(); + let sort_key: LexOrdering = [PhysicalSortExpr { + expr: col("c", &schema)?, + options: SortOptions::default(), + }] + .into(); + + let branch1 = + single_partition_maintains_order_exec(parquet_exec_multiple_sorted(vec![ + sort_key.clone(), + ])); + let branch2 = + single_partition_maintains_order_exec(parquet_exec_multiple_sorted(vec![ + sort_key.clone(), + ])); + let plan = sort_required_exec_with_req(union_exec(vec![branch1, branch2]), sort_key); + + let test_config = TestConfig::default().with_prefer_existing_sort(); + + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" + SortRequiredExec: [c@2 ASC] + UnionExec + SinglePartitionMaintainsOrderExec + SortPreservingMergeExec: [c@2 ASC] + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + SinglePartitionMaintainsOrderExec + SortPreservingMergeExec: [c@2 ASC] + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); + + Ok(()) +} + +/// Verifies that `adjust_input_keys_ordering` returns `Transformed::no` +/// for a simple scan plan with no key requirements, avoiding an +/// unnecessary plan rebuild. +#[test] +fn adjust_input_keys_ordering_no_transform_for_scan() -> Result<()> { + let plan: Arc = parquet_exec(); + let requirements = PlanWithKeyRequirements::new_default(plan); + let result = adjust_input_keys_ordering(requirements)?; + assert!( + !result.transformed, + "expected Transformed::no for a scan plan with empty requirements" + ); + Ok(()) +} + +/// Verifies that `adjust_input_keys_ordering` applied via `transform_down` +/// over a filter -> scan tree returns `Transformed::no` when there are no +/// join/aggregate key requirements. 
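The contract these two tests pin down is the `Transformed` wrapper from `datafusion_common::tree_node`: a rewrite reports whether it changed anything, so `transform_down` can skip rebuilding untouched parents. A minimal, generic sketch of a rewrite honoring that contract (illustrative only, not the rule's actual body):

use datafusion_common::tree_node::Transformed;
use datafusion_common::Result;

/// A rewrite that decides it has nothing to do and says so: wrapping the
/// node in `Transformed::no` keeps `transformed == false`, which lets
/// `transform_down` avoid reconstructing the surrounding plan.
fn no_op_rewrite<T>(node: T) -> Result<Transformed<T>> {
    // `Transformed::yes(node)` would instead mark the subtree as rewritten.
    Ok(Transformed::no(node))
}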
+#[test] +fn adjust_input_keys_ordering_no_transform_for_filter_scan() -> Result<()> { + let plan: Arc = filter_exec(parquet_exec()); + let requirements = PlanWithKeyRequirements::new_default(plan); + let result = requirements.transform_down(adjust_input_keys_ordering)?; + assert!( + !result.transformed, + "expected Transformed::no for a filter->scan tree with no key requirements" + ); + Ok(()) +} diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs index e3a0eb7e1aa6f..40bcdbbd6efef 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs @@ -19,21 +19,21 @@ use std::sync::Arc; use crate::memory_limit::DummyStreamPartition; use crate::physical_optimizer::test_utils::{ - aggregate_exec, bounded_window_exec, bounded_window_exec_with_partition, - check_integrity, coalesce_batches_exec, coalesce_partitions_exec, create_test_schema, - create_test_schema2, create_test_schema3, filter_exec, global_limit_exec, - hash_join_exec, local_limit_exec, memory_exec, parquet_exec, parquet_exec_with_sort, - projection_exec, repartition_exec, sort_exec, sort_exec_with_fetch, sort_expr, - sort_expr_options, sort_merge_join_exec, sort_preserving_merge_exec, - sort_preserving_merge_exec_with_fetch, spr_repartition_exec, stream_exec_ordered, - union_exec, RequirementsTestExec, + RequirementsTestExec, aggregate_exec, bounded_window_exec, + bounded_window_exec_with_partition, check_integrity, coalesce_partitions_exec, + create_test_schema, create_test_schema2, create_test_schema3, filter_exec, + global_limit_exec, hash_join_exec, local_limit_exec, memory_exec, parquet_exec, + parquet_exec_with_sort, projection_exec, repartition_exec, sort_exec, + sort_exec_with_fetch, sort_expr, sort_expr_options, sort_merge_join_exec, + sort_preserving_merge_exec, sort_preserving_merge_exec_with_fetch, + spr_repartition_exec, stream_exec_ordered, union_exec, }; -use arrow::compute::SortOptions; +use arrow::compute::{SortOptions}; use arrow::datatypes::{DataType, SchemaRef}; -use datafusion_common::config::ConfigOptions; +use datafusion_common::config::{ConfigOptions, CsvOptions}; use datafusion_common::tree_node::{TreeNode, TransformedResult}; -use datafusion_common::{Result, TableReference}; +use datafusion_common::{create_array, Result, TableReference}; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::source::DataSourceExec; use datafusion_expr_common::operator::Operator; @@ -58,24 +58,29 @@ use datafusion_physical_optimizer::enforce_distribution::EnforceDistribution; use datafusion_physical_optimizer::output_requirements::OutputRequirementExec; use datafusion_physical_optimizer::PhysicalOptimizerRule; use datafusion::prelude::*; -use arrow::array::{Int32Array, RecordBatch}; +use arrow::array::{record_batch, ArrayRef, Int32Array, RecordBatch}; use arrow::datatypes::{Field}; use arrow_schema::Schema; use datafusion_execution::TaskContext; use datafusion_catalog::streaming::StreamingTable; use futures::StreamExt; -use insta::{assert_snapshot, Settings}; +use insta::{Settings, assert_snapshot}; /// Create a sorted Csv exec fn csv_exec_sorted( schema: &SchemaRef, sort_exprs: impl IntoIterator, ) -> Arc { + let options = CsvOptions { + has_header: Some(false), + delimiter: 0, + quote: 0, + ..Default::default() + }; let mut builder = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), - schema.clone(), - 
Arc::new(CsvSource::new(false, 0, 0)), + Arc::new(CsvSource::new(schema.clone()).with_csv_options(options)), ) .with_file(PartitionedFile::new("x".to_string(), 100)); if let Some(ordering) = LexOrdering::new(sort_exprs) { @@ -361,8 +366,8 @@ async fn test_union_inputs_different_sorted2() -> Result<()> { #[tokio::test] // Test with `repartition_sorts` enabled to preserve pre-sorted partitions and avoid resorting -async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_with_repartition_sorts_true( -) -> Result<()> { +async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_with_repartition_sorts_true() +-> Result<()> { assert_snapshot!( union_with_mix_of_presorted_and_explicitly_resorted_inputs_impl(true).await?, @r" @@ -387,8 +392,8 @@ async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_with_reparti #[tokio::test] // Test with `repartition_sorts` disabled, causing a full resort of the data -async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_with_repartition_sorts_false( -) -> Result<()> { +async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_with_repartition_sorts_false() +-> Result<()> { assert_snapshot!( union_with_mix_of_presorted_and_explicitly_resorted_inputs_impl(false).await?, @r" @@ -659,21 +664,13 @@ async fn test_union_inputs_different_sorted7() -> Result<()> { // Union has unnecessarily fine ordering below it. We should be able to replace them with absolutely necessary ordering. let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true); assert_snapshot!(test.run(), @r" - Input Plan: + Input / Optimized Plan: SortPreservingMergeExec: [nullable_col@0 ASC] UnionExec SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet - - Optimized Plan: - SortPreservingMergeExec: [nullable_col@0 ASC] - UnionExec - SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet - SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet "); // Union preserves the inputs ordering, and we should not change any of the SortExecs under UnionExec @@ -773,8 +770,8 @@ async fn test_soft_hard_requirements_remove_soft_requirement() -> Result<()> { } #[tokio::test] -async fn test_soft_hard_requirements_remove_soft_requirement_without_pushdowns( -) -> Result<()> { +async fn test_soft_hard_requirements_remove_soft_requirement_without_pushdowns() +-> Result<()> { let schema = create_test_schema()?; let source = parquet_exec(schema.clone()); let ordering = [sort_expr_options( @@ -1072,8 +1069,8 @@ async fn test_soft_hard_requirements_multiple_sorts() -> Result<()> { } #[tokio::test] -async fn test_soft_hard_requirements_with_multiple_soft_requirements_and_output_requirement( -) -> Result<()> { +async fn test_soft_hard_requirements_with_multiple_soft_requirements_and_output_requirement() +-> Result<()> { let schema = create_test_schema()?; let source = parquet_exec(schema.clone()); let ordering = [sort_expr_options( @@ -1259,7 +1256,8 @@ async fn 
test_union_inputs_different_sorted_with_limit() -> Result<()> { let physical_plan = sort_preserving_merge_exec(ordering3, union); let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true); - // Should not change the unnecessarily fine `SortExec`s because there is `LimitExec` + // Should not change the unnecessarily fine `SortExec`s because there are + // explicit limit nodes above the second sort. assert_snapshot!(test.run(), @r" Input Plan: SortPreservingMergeExec: [nullable_col@0 ASC] @@ -1342,12 +1340,12 @@ async fn test_sort_merge_join_order_by_left() -> Result<()> { assert_snapshot!(test.run(), @r" Input Plan: SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC] - SortMergeJoin: join_type=..., on=[(nullable_col@0, col_a@0)] + SortMergeJoinExec: join_type=..., on=[(nullable_col@0, col_a@0)] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet Optimized Plan: - SortMergeJoin: join_type=..., on=[(nullable_col@0, col_a@0)] + SortMergeJoinExec: join_type=..., on=[(nullable_col@0, col_a@0)] SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet SortExec: expr=[col_a@0 ASC], preserve_partitioning=[false] @@ -1359,13 +1357,13 @@ async fn test_sort_merge_join_order_by_left() -> Result<()> { assert_snapshot!(test.run(), @r" Input Plan: SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC] - SortMergeJoin: join_type=..., on=[(nullable_col@0, col_a@0)] + SortMergeJoinExec: join_type=..., on=[(nullable_col@0, col_a@0)] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet Optimized Plan: SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false] - SortMergeJoin: join_type=..., on=[(nullable_col@0, col_a@0)] + SortMergeJoinExec: join_type=..., on=[(nullable_col@0, col_a@0)] SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet SortExec: expr=[col_a@0 ASC], preserve_partitioning=[false] @@ -1432,12 +1430,12 @@ async fn test_sort_merge_join_order_by_right() -> Result<()> { assert_snapshot!(test.run(), @r" Input Plan: SortPreservingMergeExec: [col_a@2 ASC, col_b@3 ASC] - SortMergeJoin: join_type=..., on=[(nullable_col@0, col_a@0)] + SortMergeJoinExec: join_type=..., on=[(nullable_col@0, col_a@0)] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet Optimized Plan: - SortMergeJoin: join_type=..., on=[(nullable_col@0, col_a@0)] + SortMergeJoinExec: join_type=..., on=[(nullable_col@0, col_a@0)] SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet SortExec: expr=[col_a@0 ASC, col_b@1 ASC], preserve_partitioning=[false] @@ -1449,12 +1447,12 @@ async fn test_sort_merge_join_order_by_right() -> Result<()> { assert_snapshot!(test.run(), @r" Input Plan: SortPreservingMergeExec: [col_a@0 ASC, col_b@1 ASC] - 
SortMergeJoin: join_type=..., on=[(nullable_col@0, col_a@0)] + SortMergeJoinExec: join_type=..., on=[(nullable_col@0, col_a@0)] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet Optimized Plan: - SortMergeJoin: join_type=..., on=[(nullable_col@0, col_a@0)] + SortMergeJoinExec: join_type=..., on=[(nullable_col@0, col_a@0)] SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet SortExec: expr=[col_a@0 ASC, col_b@1 ASC], preserve_partitioning=[false] @@ -1466,13 +1464,13 @@ async fn test_sort_merge_join_order_by_right() -> Result<()> { assert_snapshot!(test.run(), @r" Input Plan: SortPreservingMergeExec: [col_a@2 ASC, col_b@3 ASC] - SortMergeJoin: join_type=..., on=[(nullable_col@0, col_a@0)] + SortMergeJoinExec: join_type=..., on=[(nullable_col@0, col_a@0)] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet Optimized Plan: SortExec: expr=[col_a@2 ASC, col_b@3 ASC], preserve_partitioning=[false] - SortMergeJoin: join_type=..., on=[(nullable_col@0, col_a@0)] + SortMergeJoinExec: join_type=..., on=[(nullable_col@0, col_a@0)] SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet SortExec: expr=[col_a@0 ASC], preserve_partitioning=[false] @@ -1515,13 +1513,13 @@ async fn test_sort_merge_join_complex_order_by() -> Result<()> { assert_snapshot!(test.run(), @r" Input Plan: SortPreservingMergeExec: [col_b@3 ASC, col_a@2 ASC] - SortMergeJoin: join_type=Inner, on=[(nullable_col@0, col_a@0)] + SortMergeJoinExec: join_type=Inner, on=[(nullable_col@0, col_a@0)] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet Optimized Plan: SortExec: expr=[col_b@3 ASC, nullable_col@0 ASC], preserve_partitioning=[false] - SortMergeJoin: join_type=Inner, on=[(nullable_col@0, col_a@0)] + SortMergeJoinExec: join_type=Inner, on=[(nullable_col@0, col_a@0)] SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet SortExec: expr=[col_a@0 ASC], preserve_partitioning=[false] @@ -1542,12 +1540,12 @@ async fn test_sort_merge_join_complex_order_by() -> Result<()> { assert_snapshot!(test.run(), @r" Input Plan: SortPreservingMergeExec: [nullable_col@0 ASC, col_b@3 ASC, col_a@2 ASC] - SortMergeJoin: join_type=Inner, on=[(nullable_col@0, col_a@0)] + SortMergeJoinExec: join_type=Inner, on=[(nullable_col@0, col_a@0)] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet Optimized Plan: - SortMergeJoin: join_type=Inner, on=[(nullable_col@0, col_a@0)] + SortMergeJoinExec: join_type=Inner, on=[(nullable_col@0, col_a@0)] SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet SortExec: 
expr=[col_a@0 ASC, col_b@1 ASC], preserve_partitioning=[false] @@ -1626,13 +1624,13 @@ async fn test_with_lost_ordering_unbounded() -> Result<()> { SortExec: expr=[a@0 ASC], preserve_partitioning=[false] CoalescePartitionsExec RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC] Optimized Plan: SortPreservingMergeExec: [a@0 ASC] RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC] "); @@ -1644,13 +1642,13 @@ async fn test_with_lost_ordering_unbounded() -> Result<()> { SortExec: expr=[a@0 ASC], preserve_partitioning=[false] CoalescePartitionsExec RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC] Optimized Plan: SortPreservingMergeExec: [a@0 ASC] RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC] "); @@ -1669,7 +1667,7 @@ async fn test_with_lost_ordering_bounded() -> Result<()> { SortExec: expr=[a@0 ASC], preserve_partitioning=[false] CoalescePartitionsExec RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=csv, has_header=false "); @@ -1681,14 +1679,14 @@ async fn test_with_lost_ordering_bounded() -> Result<()> { SortExec: expr=[a@0 ASC], preserve_partitioning=[false] CoalescePartitionsExec RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=csv, has_header=false Optimized Plan: SortPreservingMergeExec: [a@0 ASC] SortExec: expr=[a@0 ASC], preserve_partitioning=[true] RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=csv, has_header=false "); @@ -1710,7 
+1708,7 @@ async fn test_do_not_pushdown_through_spm() -> Result<()> { Input / Optimized Plan: SortExec: expr=[b@1 ASC], preserve_partitioning=[false] SortPreservingMergeExec: [a@0 ASC, b@1 ASC] - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC, b@1 ASC], file_type=csv, has_header=false "); @@ -1739,13 +1737,13 @@ async fn test_pushdown_through_spm() -> Result<()> { Input Plan: SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC], preserve_partitioning=[false] SortPreservingMergeExec: [a@0 ASC, b@1 ASC] - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC, b@1 ASC], file_type=csv, has_header=false Optimized Plan: SortPreservingMergeExec: [a@0 ASC, b@1 ASC] SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC], preserve_partitioning=[true] - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC, b@1 ASC], file_type=csv, has_header=false "); Ok(()) @@ -1769,7 +1767,7 @@ async fn test_window_multi_layer_requirement() -> Result<()> { BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortPreservingMergeExec: [a@0 ASC, b@1 ASC] RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC, b@1 ASC - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true SortExec: expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false @@ -1847,9 +1845,7 @@ async fn test_remove_unnecessary_sort_window_multilayer() -> Result<()> { )] .into(); let sort = sort_exec(ordering.clone(), source); - // Add dummy layer propagating Sort above, to test whether sort can be removed from multi layer before - let coalesce_batches = coalesce_batches_exec(sort, 128); - let window_agg = bounded_window_exec("non_nullable_col", ordering, coalesce_batches); + let window_agg = bounded_window_exec("non_nullable_col", ordering, sort); let ordering2: LexOrdering = [sort_expr_options( "non_nullable_col", &window_agg.schema(), @@ -1875,17 +1871,15 @@ async fn test_remove_unnecessary_sort_window_multilayer() -> Result<()> { FilterExec: NOT non_nullable_col@1 SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - CoalesceBatchesExec: target_batch_size=128 - SortExec: expr=[non_nullable_col@1 DESC], preserve_partitioning=[false] - DataSourceExec: partitions=1, partition_sizes=[0] + SortExec: expr=[non_nullable_col@1 DESC], preserve_partitioning=[false] + DataSourceExec: partitions=1, partition_sizes=[0] Optimized Plan: WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: 
Range, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] FilterExec: NOT non_nullable_col@1 BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - CoalesceBatchesExec: target_batch_size=128 - SortExec: expr=[non_nullable_col@1 DESC], preserve_partitioning=[false] - DataSourceExec: partitions=1, partition_sizes=[0] + SortExec: expr=[non_nullable_col@1 DESC], preserve_partitioning=[false] + DataSourceExec: partitions=1, partition_sizes=[0] "#); Ok(()) @@ -1964,7 +1958,7 @@ async fn test_remove_unnecessary_sort2() -> Result<()> { assert_snapshot!(test.run(), @r" Input Plan: RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10 - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false] SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC] SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false] @@ -2011,7 +2005,7 @@ async fn test_remove_unnecessary_sort3() -> Result<()> { AggregateExec: mode=Final, gby=[], aggr=[] SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC] SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[true] - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true SortPreservingMergeExec: [non_nullable_col@1 ASC] SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[false] DataSourceExec: partitions=1, partition_sizes=[0] @@ -2360,7 +2354,7 @@ async fn test_commutativity() -> Result<()> { assert_snapshot!(displayable(orig_plan.as_ref()).indent(true), @r#" SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false] - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: partitions=1, partition_sizes=[0] "#); @@ -2812,3 +2806,47 @@ async fn test_partial_sort_with_homogeneous_batches() -> Result<()> { Ok(()) } + +#[tokio::test] +async fn test_sort_with_streaming_table() -> Result<()> { + let batch = record_batch!(("a", Int32, [1, 2, 3]), ("b", Int32, [1, 2, 3]))?; + + let ctx = SessionContext::new(); + + let sort_order = vec![ + SortExpr::new( + Expr::Column(datafusion_common::Column::new( + Option::::None, + "a", + )), + true, + false, + ), + SortExpr::new( + Expr::Column(datafusion_common::Column::new( + Option::::None, + "b", + )), + true, + false, + ), + ]; + let schema = batch.schema(); + let batches = Arc::new(DummyStreamPartition { + schema: schema.clone(), + batches: vec![batch], + }) as _; + let provider = StreamingTable::try_new(schema.clone(), vec![batches])? 
+ .with_sort_order(sort_order); + ctx.register_table("test_table", Arc::new(provider))?; + + let sql = "SELECT a FROM test_table GROUP BY a ORDER BY a"; + let results = ctx.sql(sql).await?.collect().await?; + + assert_eq!(results.len(), 1); + assert_eq!(results[0].num_columns(), 1); + let expected = create_array!(Int32, vec![1, 2, 3]) as ArrayRef; + assert_eq!(results[0].column(0), &expected); + + Ok(()) +} diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting_monotonicity.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting_monotonicity.rs index ef233e222912c..de7611ff211a5 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting_monotonicity.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting_monotonicity.rs @@ -31,7 +31,7 @@ use datafusion_physical_expr::expressions::col; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::LexOrdering; use datafusion_physical_plan::windows::{ - create_window_expr, BoundedWindowAggExec, WindowAggExec, + BoundedWindowAggExec, WindowAggExec, create_window_expr, }; use datafusion_physical_plan::{ExecutionPlan, InputOrderMode}; use insta::assert_snapshot; diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown.rs new file mode 100644 index 0000000000000..5f64c9e4a5400 --- /dev/null +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown.rs @@ -0,0 +1,3376 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
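The tests in this new file drive the rule through the `OptimizationTest` harness from `pushdown_utils`; at its core that presumably amounts to applying `FilterPushdown` as an ordinary `PhysicalOptimizerRule`. A minimal sketch of the direct invocation (a hedged illustration, not the harness code itself):

use std::sync::Arc;
use datafusion_common::config::ConfigOptions;
use datafusion_common::Result;
use datafusion_physical_optimizer::filter_pushdown::FilterPushdown;
use datafusion_physical_optimizer::PhysicalOptimizerRule;
use datafusion_physical_plan::ExecutionPlan;

/// Apply the standalone pushdown rule: scans that support pushdown absorb
/// eligible predicates, while unsupported ones keep their FilterExec parent.
fn push_down_filters(plan: Arc<dyn ExecutionPlan>) -> Result<Arc<dyn ExecutionPlan>> {
    FilterPushdown::new().optimize(plan, &ConfigOptions::default())
}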
+ +use std::sync::{Arc, LazyLock}; + +use arrow::{ + array::record_batch, + datatypes::{DataType, Field, Schema, SchemaRef}, + util::pretty::pretty_format_batches, +}; +use arrow_schema::SortOptions; +use datafusion::{ + assert_batches_eq, + logical_expr::Operator, + physical_plan::{ + PhysicalExpr, + expressions::{BinaryExpr, Column, Literal}, + }, + prelude::{SessionConfig, SessionContext}, + scalar::ScalarValue, +}; +use datafusion_catalog::memory::DataSourceExec; +use datafusion_common::config::ConfigOptions; +use datafusion_datasource::{ + PartitionedFile, file_groups::FileGroup, file_scan_config::FileScanConfigBuilder, +}; +use datafusion_execution::object_store::ObjectStoreUrl; +use datafusion_expr::ScalarUDF; +use datafusion_functions::math::random::RandomFunc; +use datafusion_functions_aggregate::{count::count_udaf, min_max::min_udaf}; +use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr, expressions::col}; +use datafusion_physical_expr::{ + Partitioning, ScalarFunctionExpr, aggregate::AggregateExprBuilder, +}; +use datafusion_physical_optimizer::{ + PhysicalOptimizerRule, filter_pushdown::FilterPushdown, +}; +use datafusion_physical_plan::{ + ExecutionPlan, + aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy}, + coalesce_partitions::CoalescePartitionsExec, + collect, + filter::{FilterExec, FilterExecBuilder}, + projection::ProjectionExec, + repartition::RepartitionExec, + sorts::sort::SortExec, +}; + +use super::pushdown_utils::{ + OptimizationTest, TestNode, TestScanBuilder, TestSource, format_plan_for_test, +}; +use datafusion_physical_plan::union::UnionExec; +use object_store::memory::InMemory; + +#[test] +fn test_pushdown_into_scan() { + let scan = TestScanBuilder::new(schema()).with_support(true).build(); + let predicate = col_lit_predicate("a", "foo", &schema()); + let plan = Arc::new(FilterExec::try_new(predicate, scan).unwrap()); + + // expect the predicate to be pushed down into the DataSource + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - FilterExec: a@0 = foo + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo + " + ); +} + +#[test] +fn test_pushdown_volatile_functions_not_allowed() { + // Test that we do not push down filters with volatile functions + // Use random() as an example of a volatile function + let scan = TestScanBuilder::new(schema()).with_support(true).build(); + let cfg = Arc::new(ConfigOptions::default()); + let predicate = Arc::new(BinaryExpr::new( + Arc::new(Column::new_with_schema("a", &schema()).unwrap()), + Operator::Eq, + Arc::new( + ScalarFunctionExpr::try_new( + Arc::new(ScalarUDF::from(RandomFunc::new())), + vec![], + &schema(), + cfg, + ) + .unwrap(), + ), + )) as Arc; + let plan = Arc::new(FilterExec::try_new(predicate, scan).unwrap()); + // expect the filter to not be pushed down + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - FilterExec: a@0 = random() + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - FilterExec: a@0 = random() + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + 
", + ); +} + +/// Show that we can use config options to determine how to do pushdown. +#[test] +fn test_pushdown_into_scan_with_config_options() { + let scan = TestScanBuilder::new(schema()).with_support(true).build(); + let predicate = col_lit_predicate("a", "foo", &schema()); + let plan = Arc::new(FilterExec::try_new(predicate, scan).unwrap()) as _; + + let mut cfg = ConfigOptions::default(); + insta::assert_snapshot!( + OptimizationTest::new( + Arc::clone(&plan), + FilterPushdown::new(), + false + ), + @r" + OptimizationTest: + input: + - FilterExec: a@0 = foo + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - FilterExec: a@0 = foo + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + " + ); + + cfg.execution.parquet.pushdown_filters = true; + insta::assert_snapshot!( + OptimizationTest::new( + plan, + FilterPushdown::new(), + true + ), + @r" + OptimizationTest: + input: + - FilterExec: a@0 = foo + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo + " + ); +} + +// Inner-join part is covered by push_down_filter_parquet.slt::test_hashjoin_parent_filter_pushdown. +// The Left-join part stays in Rust: SQL's outer-join-elimination rewrites +// `LEFT JOIN ... WHERE ` into an INNER JOIN +// before physical filter pushdown runs, so the preserved-vs-non-preserved +// distinction this test exercises is not reachable via SQL. +#[tokio::test] +async fn test_static_filter_pushdown_through_hash_join() { + use datafusion_common::JoinType; + use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode}; + + // Create build side with limited values + let build_batches = vec![ + record_batch!( + ("a", Utf8, ["aa", "ab"]), + ("b", Utf8View, ["ba", "bb"]), + ("c", Float64, [1.0, 2.0]) + ) + .unwrap(), + ]; + let build_side_schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::Utf8View, false), + Field::new("c", DataType::Float64, false), + ])); + let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema)) + .with_support(true) + .with_batches(build_batches) + .build(); + + // Create probe side with more values + let probe_batches = vec![ + record_batch!( + ("d", Utf8, ["aa", "ab", "ac", "ad"]), + ("e", Utf8View, ["ba", "bb", "bc", "bd"]), + ("f", Float64, [1.0, 2.0, 3.0, 4.0]) + ) + .unwrap(), + ]; + let probe_side_schema = Arc::new(Schema::new(vec![ + Field::new("d", DataType::Utf8, false), + Field::new("e", DataType::Utf8View, false), + Field::new("f", DataType::Float64, false), + ])); + let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema)) + .with_support(true) + .with_batches(probe_batches) + .build(); + + // Create HashJoinExec + let on = vec![( + col("a", &build_side_schema).unwrap(), + col("d", &probe_side_schema).unwrap(), + )]; + let join = Arc::new( + HashJoinExec::try_new( + build_scan, + probe_scan, + on, + None, + &JoinType::Inner, + None, + PartitionMode::Partitioned, + datafusion_common::NullEquality::NullEqualsNothing, + false, + ) + .unwrap(), + ); + + // Create filters that can be pushed down to different sides + // We need to create filters in the context of the join output schema + let join_schema = join.schema(); + 
+    // Filter on build side column: a = 'aa'
+    let left_filter = col_lit_predicate("a", "aa", &join_schema);
+    // Filter on probe side column: e = 'ba'
+    let right_filter = col_lit_predicate("e", "ba", &join_schema);
+    // Filter that references both sides: a = d (should not be pushed down)
+    let cross_filter = Arc::new(BinaryExpr::new(
+        col("a", &join_schema).unwrap(),
+        Operator::Eq,
+        col("d", &join_schema).unwrap(),
+    )) as Arc<dyn PhysicalExpr>;
+
+    let filter =
+        Arc::new(FilterExec::try_new(left_filter, Arc::clone(&join) as _).unwrap());
+    let filter = Arc::new(FilterExec::try_new(right_filter, filter).unwrap());
+    let plan = Arc::new(FilterExec::try_new(cross_filter, filter).unwrap())
+        as Arc<dyn ExecutionPlan>;
+
+    // Test that filters are pushed down correctly to each side of the join
+    insta::assert_snapshot!(
+        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = d@3
+        -   FilterExec: e@4 = ba
+        -     FilterExec: a@0 = aa
+        -       HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, d@0)]
+        -         DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+        -         DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - FilterExec: a@0 = d@3
+          -   HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, d@0)]
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = aa
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true, predicate=e@1 = ba
+    "
+    );
+
+    // Test left join: filter on preserved (build) side is pushed down,
+    // filter on non-preserved (probe) side is NOT pushed down.
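+    // Roughly the SQL this corresponds to (table/column names illustrative):
+    //   SELECT * FROM build LEFT JOIN probe ON build.a = probe.d
+    //   WHERE build.a = 'aa' AND probe.e = 'ba'
+    // except that SQL planning would rewrite it to an INNER join (the
+    // probe-side predicate rejects NULLs), which is why the plan is built
+    // by hand here.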
+    let join = Arc::new(
+        HashJoinExec::try_new(
+            TestScanBuilder::new(Arc::clone(&build_side_schema))
+                .with_support(true)
+                .build(),
+            TestScanBuilder::new(Arc::clone(&probe_side_schema))
+                .with_support(true)
+                .build(),
+            vec![(
+                col("a", &build_side_schema).unwrap(),
+                col("d", &probe_side_schema).unwrap(),
+            )],
+            None,
+            &JoinType::Left,
+            None,
+            PartitionMode::Partitioned,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    let join_schema = join.schema();
+    // Filter on build side column (preserved): should be pushed down
+    let left_filter = col_lit_predicate("a", "aa", &join_schema);
+    // Filter on probe side column (not preserved): should NOT be pushed down
+    let right_filter = col_lit_predicate("e", "ba", &join_schema);
+    let filter =
+        Arc::new(FilterExec::try_new(left_filter, Arc::clone(&join) as _).unwrap());
+    let plan = Arc::new(FilterExec::try_new(right_filter, filter).unwrap())
+        as Arc<dyn ExecutionPlan>;
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: e@4 = ba
+        -   FilterExec: a@0 = aa
+        -     HashJoinExec: mode=Partitioned, join_type=Left, on=[(a@0, d@0)]
+        -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+        -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - FilterExec: e@4 = ba
+          -   HashJoinExec: mode=Partitioned, join_type=Left, on=[(a@0, d@0)]
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = aa
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true
+    "
+    );
+}
+
+#[test]
+fn test_filter_collapse() {
+    // filter should be pushed down into the parquet scan with two filters
+    let scan = TestScanBuilder::new(schema()).with_support(true).build();
+    let predicate1 = col_lit_predicate("a", "foo", &schema());
+    let filter1 = Arc::new(FilterExec::try_new(predicate1, scan).unwrap());
+    let predicate2 = col_lit_predicate("b", "bar", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate2, filter1).unwrap());
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: b@1 = bar
+        -   FilterExec: a@0 = foo
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo AND b@1 = bar
+    "
+    );
+}
+
+#[test]
+fn test_filter_with_projection() {
+    let scan = TestScanBuilder::new(schema()).with_support(true).build();
+    let projection = vec![1, 0];
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(
+        FilterExecBuilder::new(predicate, Arc::clone(&scan))
+            .apply_projection(Some(projection))
+            .unwrap()
+            .build()
+            .unwrap(),
+    );
+
+    // expect the predicate to be pushed down into the DataSource but the FilterExec to be converted to ProjectionExec
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo, projection=[b@1, a@0]
+        -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - ProjectionExec: expr=[b@1 as b, a@0 as a]
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
+    ",
+    );
+
+    // add a test where the filter is on a column that isn't included in the output
+    let projection = vec![1];
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(
+        FilterExecBuilder::new(predicate, scan)
+            .apply_projection(Some(projection))
+            .unwrap()
+            .build()
+            .unwrap(),
+    );
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo, projection=[b@1]
+        -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - ProjectionExec: expr=[b@1 as b]
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
+    "
+    );
+}
+
+#[test]
+fn test_filter_collapse_outer_fetch_preserved() {
+    // When the outer filter has fetch and inner does not, the merged filter should preserve fetch
+    let scan = TestScanBuilder::new(schema()).with_support(false).build();
+    let predicate1 = col_lit_predicate("a", "foo", &schema());
+    let filter1 = Arc::new(FilterExec::try_new(predicate1, scan).unwrap());
+    let predicate2 = col_lit_predicate("b", "bar", &schema());
+    let plan = Arc::new(
+        FilterExecBuilder::new(predicate2, filter1)
+            .with_fetch(Some(10))
+            .build()
+            .unwrap(),
+    );
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: b@1 = bar, fetch=10
+        -   FilterExec: a@0 = foo
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+      output:
+        Ok:
+          - FilterExec: b@1 = bar AND a@0 = foo, fetch=10
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+    "
+    );
+}
+
+#[test]
+fn test_filter_collapse_inner_fetch_preserved() {
+    // When the inner filter has fetch and outer does not, the merged filter should preserve fetch
+    let scan = TestScanBuilder::new(schema()).with_support(false).build();
+    let predicate1 = col_lit_predicate("a", "foo", &schema());
+    let filter1 = Arc::new(
+        FilterExecBuilder::new(predicate1, scan)
+            .with_fetch(Some(5))
+            .build()
+            .unwrap(),
+    );
+    let predicate2 = col_lit_predicate("b", "bar", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate2, filter1).unwrap());
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: b@1 = bar
+        -   FilterExec: a@0 = foo, fetch=5
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+      output:
+        Ok:
+          - FilterExec: b@1 = bar AND a@0 = foo, fetch=5
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+    "
+    );
+}
+
+#[test]
+fn test_filter_collapse_both_fetch_uses_minimum() {
+    // When both filters have fetch, the merged filter should use the smaller (tighter) fetch.
+    // Inner fetch=5 is tighter than outer fetch=10, so the result should be fetch=5.
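+    // In other words, the merge rule these three tests pin down is, as a
+    // sketch of the semantics (not the actual implementation):
+    //   let merged = match (outer_fetch, inner_fetch) {
+    //       (Some(a), Some(b)) => Some(a.min(b)),
+    //       (f, None) | (None, f) => f,
+    //   };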
+ let scan = TestScanBuilder::new(schema()).with_support(false).build(); + let predicate1 = col_lit_predicate("a", "foo", &schema()); + let filter1 = Arc::new( + FilterExecBuilder::new(predicate1, scan) + .with_fetch(Some(5)) + .build() + .unwrap(), + ); + let predicate2 = col_lit_predicate("b", "bar", &schema()); + let plan = Arc::new( + FilterExecBuilder::new(predicate2, filter1) + .with_fetch(Some(10)) + .build() + .unwrap(), + ); + + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - FilterExec: b@1 = bar, fetch=10 + - FilterExec: a@0 = foo, fetch=5 + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false + output: + Ok: + - FilterExec: b@1 = bar AND a@0 = foo, fetch=5 + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false + " + ); +} + +#[test] +fn test_filter_with_fetch_fully_pushed_to_scan() { + // When a FilterExec has a fetch limit and all predicates are pushed down + // to a supportive DataSourceExec, the FilterExec is removed and the fetch + // must be propagated to the DataSourceExec. + let scan = TestScanBuilder::new(schema()).with_support(true).build(); + let predicate = col_lit_predicate("a", "foo", &schema()); + let plan = Arc::new( + FilterExecBuilder::new(predicate, scan) + .with_fetch(Some(10)) + .build() + .unwrap(), + ); + + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - FilterExec: a@0 = foo, fetch=10 + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], limit=10, file_type=test, pushdown_supported=true, predicate=a@0 = foo + " + ); +} + +#[test] +fn test_filter_with_fetch_and_projection_fully_pushed_to_scan() { + // When a FilterExec has both fetch and projection, and all predicates are + // pushed down, the filter is replaced by a ProjectionExec and the fetch + // must still be propagated to the DataSourceExec. + let scan = TestScanBuilder::new(schema()).with_support(true).build(); + let projection = vec![1, 0]; + let predicate = col_lit_predicate("a", "foo", &schema()); + let plan = Arc::new( + FilterExecBuilder::new(predicate, scan) + .with_fetch(Some(5)) + .apply_projection(Some(projection)) + .unwrap() + .build() + .unwrap(), + ); + + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - FilterExec: a@0 = foo, projection=[b@1, a@0], fetch=5 + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - ProjectionExec: expr=[b@1 as b, a@0 as a] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], limit=5, file_type=test, pushdown_supported=true, predicate=a@0 = foo + " + ); +} + +#[test] +fn test_filter_with_fetch_partially_pushed_to_scan() { + // When a FilterExec has fetch and only some predicates are pushed down, + // the FilterExec remains with the unpushed predicate and keeps its fetch. 
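+    // Here `a = 'foo'` is deterministic and is pushed into the scan, while
+    // `a = random()` is volatile (it can produce a different value on every
+    // evaluation), so it has to stay in the FilterExec above the scan.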
+    let scan = TestScanBuilder::new(schema()).with_support(true).build();
+    let pushed_predicate = col_lit_predicate("a", "foo", &schema());
+    let volatile_predicate = {
+        let cfg = Arc::new(ConfigOptions::default());
+        Arc::new(BinaryExpr::new(
+            Arc::new(Column::new_with_schema("a", &schema()).unwrap()),
+            Operator::Eq,
+            Arc::new(
+                ScalarFunctionExpr::try_new(
+                    Arc::new(ScalarUDF::from(RandomFunc::new())),
+                    vec![],
+                    &schema(),
+                    cfg,
+                )
+                .unwrap(),
+            ),
+        )) as Arc<dyn PhysicalExpr>
+    };
+    // Combine: a = 'foo' AND a = random()
+    let combined = Arc::new(BinaryExpr::new(
+        pushed_predicate,
+        Operator::And,
+        volatile_predicate,
+    )) as Arc<dyn PhysicalExpr>;
+    let plan = Arc::new(
+        FilterExecBuilder::new(combined, scan)
+            .with_fetch(Some(7))
+            .build()
+            .unwrap(),
+    );
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo AND a@0 = random(), fetch=7
+        -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - FilterExec: a@0 = random(), fetch=7
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
+    "
+    );
+}
+
+#[test]
+fn test_filter_with_fetch_not_pushed_to_unsupportive_scan() {
+    // When the DataSourceExec does not support pushdown, the FilterExec
+    // remains unchanged with its fetch intact.
+    let scan = TestScanBuilder::new(schema()).with_support(false).build();
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(
+        FilterExecBuilder::new(predicate, scan)
+            .with_fetch(Some(3))
+            .build()
+            .unwrap(),
+    );
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo, fetch=3
+        -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+      output:
+        Ok:
+          - FilterExec: a@0 = foo, fetch=3
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+    "
+    );
+}
+
+#[test]
+fn test_push_down_through_transparent_nodes() {
+    // expect the predicate to be pushed down into the DataSource
+    let scan = TestScanBuilder::new(schema()).with_support(true).build();
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let filter = Arc::new(FilterExec::try_new(predicate, scan).unwrap());
+    let repartition = Arc::new(
+        RepartitionExec::try_new(filter, Partitioning::RoundRobinBatch(1)).unwrap(),
+    );
+    let predicate = col_lit_predicate("b", "bar", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate, repartition).unwrap());
+
+    // expect the predicate to be pushed down into the DataSource
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: b@1 = bar
+        -   RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=1
+        -     FilterExec: a@0 = foo
+        -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=1
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo AND b@1 = bar
+    "
+    );
+}
+
+#[test]
+fn test_pushdown_through_aggregates_on_grouping_columns() {
+    // Test that filters on
grouping columns can be pushed through AggregateExec. + // This test has two filters: + // 1. An inner filter (a@0 = foo) below the aggregate - gets pushed to DataSource + // 2. An outer filter (b@1 = bar) above the aggregate - also gets pushed through because 'b' is a grouping column + let scan = TestScanBuilder::new(schema()).with_support(true).build(); + + let filter = Arc::new( + FilterExecBuilder::new(col_lit_predicate("a", "foo", &schema()), scan) + .with_batch_size(10) + .build() + .unwrap(), + ); + + let aggregate_expr = vec![ + AggregateExprBuilder::new(count_udaf(), vec![col("a", &schema()).unwrap()]) + .schema(schema()) + .alias("cnt") + .build() + .map(Arc::new) + .unwrap(), + ]; + let group_by = PhysicalGroupBy::new_single(vec![ + (col("a", &schema()).unwrap(), "a".to_string()), + (col("b", &schema()).unwrap(), "b".to_string()), + ]); + let aggregate = Arc::new( + AggregateExec::try_new( + AggregateMode::Final, + group_by, + aggregate_expr.clone(), + vec![None], + filter, + schema(), + ) + .unwrap(), + ); + + let predicate = col_lit_predicate("b", "bar", &schema()); + let plan = Arc::new( + FilterExecBuilder::new(predicate, aggregate) + .with_batch_size(100) + .build() + .unwrap(), + ); + + // Both filters should be pushed down to the DataSource since both reference grouping columns + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - FilterExec: b@1 = bar + - AggregateExec: mode=Final, gby=[a@0 as a, b@1 as b], aggr=[cnt], ordering_mode=PartiallySorted([0]) + - FilterExec: a@0 = foo + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - AggregateExec: mode=Final, gby=[a@0 as a, b@1 as b], aggr=[cnt], ordering_mode=Sorted + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo AND b@1 = bar + " + ); +} + +/// Test various combinations of handling of child pushdown results +/// in an ExecutionPlan in combination with support/not support in a DataSource. +#[test] +fn test_node_handles_child_pushdown_result() { + // If we set `with_support(true)` + `inject_filter = true` then the filter is pushed down to the DataSource + // and no FilterExec is created. + let scan = TestScanBuilder::new(schema()).with_support(true).build(); + let predicate = col_lit_predicate("a", "foo", &schema()); + let plan = Arc::new(TestNode::new(true, Arc::clone(&scan), predicate)); + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - TestInsertExec { inject_filter: true } + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - TestInsertExec { inject_filter: true } + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo + ", + ); + + // If we set `with_support(false)` + `inject_filter = true` then the filter is not pushed down to the DataSource + // and a FilterExec is created. 
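+    // Summary of the support/inject combinations exercised below:
+    //   scan support | inject_filter | result
+    //   true         | true          | predicate absorbed by the DataSource
+    //   false        | true          | TestNode wraps its child in a FilterExec
+    //   false        | false         | predicate not pushed; no FilterExec created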
+ let scan = TestScanBuilder::new(schema()).with_support(false).build(); + let predicate = col_lit_predicate("a", "foo", &schema()); + let plan = Arc::new(TestNode::new(true, Arc::clone(&scan), predicate)); + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - TestInsertExec { inject_filter: true } + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false + output: + Ok: + - TestInsertExec { inject_filter: false } + - FilterExec: a@0 = foo + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false + ", + ); + + // If we set `with_support(false)` + `inject_filter = false` then the filter is not pushed down to the DataSource + // and no FilterExec is created. + let scan = TestScanBuilder::new(schema()).with_support(false).build(); + let predicate = col_lit_predicate("a", "foo", &schema()); + let plan = Arc::new(TestNode::new(false, Arc::clone(&scan), predicate)); + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - TestInsertExec { inject_filter: false } + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false + output: + Ok: + - TestInsertExec { inject_filter: false } + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false + ", + ); +} + +// Not portable to sqllogictest: requires manually constructing +// `SortExec(CoalescePartitionsExec(scan))`. A SQL `ORDER BY ... LIMIT` over a +// multi-partition scan plans as `SortPreservingMergeExec(SortExec(scan))` +// instead, so the filter-through-coalesce path this test exercises is not +// reachable via SQL. 
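+// For reference, the two shapes side by side (sketch):
+//   hand-built here:  SortExec(TopK) -> CoalescePartitionsExec -> DataSourceExec
+//   via SQL:          SortPreservingMergeExec -> SortExec -> DataSourceExec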
+#[tokio::test]
+async fn test_topk_filter_passes_through_coalesce_partitions() {
+    // Create multiple batches for different partitions
+    let batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab"]),
+            ("b", Utf8, ["bd", "bc"]),
+            ("c", Float64, [1.0, 2.0])
+        )
+        .unwrap(),
+        record_batch!(
+            ("a", Utf8, ["ac", "ad"]),
+            ("b", Utf8, ["bb", "ba"]),
+            ("c", Float64, [2.0, 1.0])
+        )
+        .unwrap(),
+    ];
+
+    // Create a source that supports all batches
+    let source = Arc::new(TestSource::new(schema(), true, batches));
+
+    let base_config =
+        FileScanConfigBuilder::new(ObjectStoreUrl::parse("test://").unwrap(), source)
+            .with_file_groups(vec![
+                // Partition 0
+                FileGroup::new(vec![PartitionedFile::new("test1.parquet", 123)]),
+                // Partition 1
+                FileGroup::new(vec![PartitionedFile::new("test2.parquet", 123)]),
+            ])
+            .build();
+
+    let scan = DataSourceExec::from_data_source(base_config);
+
+    // Add CoalescePartitionsExec to merge the two partitions
+    let coalesce =
+        Arc::new(CoalescePartitionsExec::new(scan)) as Arc<dyn ExecutionPlan>;
+
+    // Add SortExec with TopK
+    let plan = Arc::new(
+        SortExec::new(
+            LexOrdering::new(vec![PhysicalSortExpr::new(
+                col("b", &schema()).unwrap(),
+                SortOptions::new(true, false),
+            )])
+            .unwrap(),
+            coalesce,
+        )
+        .with_fetch(Some(1)),
+    ) as Arc<dyn ExecutionPlan>;
+
+    // Test optimization - the filter SHOULD pass through CoalescePartitionsExec
+    // if it properly implements from_children (not all_unsupported)
+    insta::assert_snapshot!(
+        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new_post_optimization(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: TopK(fetch=1), expr=[b@1 DESC NULLS LAST], preserve_partitioning=[false]
+        -   CoalescePartitionsExec
+        -     DataSourceExec: file_groups={2 groups: [[test1.parquet], [test2.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - SortExec: TopK(fetch=1), expr=[b@1 DESC NULLS LAST], preserve_partitioning=[false]
+          -   CoalescePartitionsExec
+          -     DataSourceExec: file_groups={2 groups: [[test1.parquet], [test2.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]
+    "
+    );
+}
+
+// Not portable to sqllogictest: this test pins `PartitionMode::Partitioned`
+// by hand-wiring `RepartitionExec(Hash, 12)` on both join sides. A SQL
+// INNER JOIN over small parquet inputs plans as `CollectLeft`, so the
+// per-partition CASE filter this test exercises is not reachable via SQL.
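+// A SQL equivalent would only reach this shape with build-side input large
+// enough that the planner rejects CollectLeft; pinning Partitioned mode
+// directly keeps the test small and deterministic.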
+#[tokio::test] +async fn test_hashjoin_dynamic_filter_pushdown_partitioned() { + use datafusion_common::JoinType; + use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode}; + + // Rough sketch of the MRE we're trying to recreate: + // COPY (select i as k from generate_series(1, 10000000) as t(i)) + // TO 'test_files/scratch/push_down_filter/t1.parquet' + // STORED AS PARQUET; + // COPY (select i as k, i as v from generate_series(1, 10000000) as t(i)) + // TO 'test_files/scratch/push_down_filter/t2.parquet' + // STORED AS PARQUET; + // create external table t1 stored as parquet location 'test_files/scratch/push_down_filter/t1.parquet'; + // create external table t2 stored as parquet location 'test_files/scratch/push_down_filter/t2.parquet'; + // explain + // select * + // from t1 + // join t2 on t1.k = t2.k; + // +---------------+------------------------------------------------------------+ + // | plan_type | plan | + // +---------------+------------------------------------------------------------+ + // | physical_plan | ┌───────────────────────────┐ | + // | | │ HashJoinExec │ | + // | | │ -------------------- ├──────────────┐ | + // | | │ on: (k = k) │ │ | + // | | └─────────────┬─────────────┘ │ | + // | | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ | + // | | │ RepartitionExec ││ RepartitionExec │ | + // | | │ -------------------- ││ -------------------- │ | + // | | │ partition_count(in->out): ││ partition_count(in->out): │ | + // | | │ 12 -> 12 ││ 12 -> 12 │ | + // | | │ ││ │ | + // | | │ partitioning_scheme: ││ partitioning_scheme: │ | + // | | │ Hash([k@0], 12) ││ Hash([k@0], 12) │ | + // | | └─────────────┬─────────────┘└─────────────┬─────────────┘ | + // | | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ | + // | | │ DataSourceExec ││ DataSourceExec │ | + // | | │ -------------------- ││ -------------------- │ | + // | | │ files: 12 ││ files: 12 │ | + // | | │ format: parquet ││ format: parquet │ | + // | | │ ││ predicate: true │ | + // | | └───────────────────────────┘└───────────────────────────┘ | + // | | | + // +---------------+------------------------------------------------------------+ + + // Create build side with limited values + let build_batches = vec![ + record_batch!( + ("a", Utf8, ["aa", "ab"]), + ("b", Utf8, ["ba", "bb"]), + ("c", Float64, [1.0, 2.0]) // Extra column not used in join + ) + .unwrap(), + ]; + let build_side_schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::Utf8, false), + Field::new("c", DataType::Float64, false), + ])); + let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema)) + .with_support(true) + .with_batches(build_batches) + .build(); + + // Create probe side with more values + let probe_batches = vec![ + record_batch!( + ("a", Utf8, ["aa", "ab", "ac", "ad"]), + ("b", Utf8, ["ba", "bb", "bc", "bd"]), + ("e", Float64, [1.0, 2.0, 3.0, 4.0]) // Extra column not used in join + ) + .unwrap(), + ]; + let probe_side_schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::Utf8, false), + Field::new("e", DataType::Float64, false), + ])); + let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema)) + .with_support(true) + .with_batches(probe_batches) + .build(); + + // Create RepartitionExec nodes for both sides with hash partitioning on join keys + let partition_count = 12; + + // Build side: DataSource -> RepartitionExec (Hash) + let build_hash_exprs = vec![ + col("a", 
&build_side_schema).unwrap(),
+        col("b", &build_side_schema).unwrap(),
+    ];
+    let build_repartition = Arc::new(
+        RepartitionExec::try_new(
+            build_scan,
+            Partitioning::Hash(build_hash_exprs, partition_count),
+        )
+        .unwrap(),
+    );
+
+    // Probe side: DataSource -> RepartitionExec (Hash)
+    let probe_hash_exprs = vec![
+        col("a", &probe_side_schema).unwrap(),
+        col("b", &probe_side_schema).unwrap(),
+    ];
+    let probe_repartition = Arc::new(
+        RepartitionExec::try_new(
+            Arc::clone(&probe_scan),
+            Partitioning::Hash(probe_hash_exprs, partition_count),
+        )
+        .unwrap(),
+    );
+
+    // Create HashJoinExec with partitioned inputs
+    let on = vec![
+        (
+            col("a", &build_side_schema).unwrap(),
+            col("a", &probe_side_schema).unwrap(),
+        ),
+        (
+            col("b", &build_side_schema).unwrap(),
+            col("b", &probe_side_schema).unwrap(),
+        ),
+    ];
+    let hash_join = Arc::new(
+        HashJoinExec::try_new(
+            build_repartition,
+            probe_repartition,
+            on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::Partitioned,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    // Top-level CoalescePartitionsExec
+    let cp =
+        Arc::new(CoalescePartitionsExec::new(hash_join)) as Arc<dyn ExecutionPlan>;
+    // Add a sort for deterministic output
+    let plan = Arc::new(SortExec::new(
+        LexOrdering::new(vec![PhysicalSortExpr::new(
+            col("a", &probe_side_schema).unwrap(),
+            SortOptions::new(true, false), // descending, nulls_first
+        )])
+        .unwrap(),
+        cp,
+    )) as Arc<dyn ExecutionPlan>;
+
+    // expect the predicate to be pushed down into the probe side DataSource
+    insta::assert_snapshot!(
+        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new_post_optimization(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+        -   CoalescePartitionsExec
+        -     HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)]
+        -       RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1
+        -         DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+        -       RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1
+        -         DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+          -   CoalescePartitionsExec
+          -     HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)]
+          -       RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1
+          -         DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+          -       RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1
+          -         DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]
+    "
+    );
+
+    // Actually apply the optimization to the plan and execute to see the filter in action
+    let mut config = ConfigOptions::default();
+    config.execution.parquet.pushdown_filters = true;
+    config.optimizer.enable_dynamic_filter_pushdown = true;
+    let plan = FilterPushdown::new_post_optimization()
+        .optimize(plan, &config)
+        .unwrap();
+    let config = SessionConfig::new().with_batch_size(10);
+    let session_ctx = SessionContext::new_with_config(config);
+    session_ctx.register_object_store(
+        ObjectStoreUrl::parse("test://").unwrap().as_ref(),
+        Arc::new(InMemory::new()),
+    );
+    let state = session_ctx.state();
+    let task_ctx = state.task_ctx();
+    
let batches = collect(Arc::clone(&plan), Arc::clone(&task_ctx)) + .await + .unwrap(); + + // Now check what our filter looks like + #[cfg(not(feature = "force_hash_collisions"))] + insta::assert_snapshot!( + format!("{}", format_plan_for_test(&plan)), + @r" + - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - CoalescePartitionsExec + - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] + - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ CASE hash_repartition % 12 WHEN 5 THEN a@0 >= ab AND a@0 <= ab AND b@1 >= bb AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:ab,c1:bb}]) WHEN 8 THEN a@0 >= aa AND a@0 <= aa AND b@1 >= ba AND b@1 <= ba AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}]) ELSE false END ] + " + ); + + // When hash collisions force all data into a single partition, we optimize away the CASE expression. + // This avoids calling create_hashes() for every row on the probe side, since hash % 1 == 0 always, + // meaning the WHEN 0 branch would always match. This optimization is also important for primary key + // joins or any scenario where all build-side data naturally lands in one partition. + #[cfg(feature = "force_hash_collisions")] + insta::assert_snapshot!( + format!("{}", format_plan_for_test(&plan)), + @r" + - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - CoalescePartitionsExec + - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] + - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}, {c0:ab,c1:bb}]) ] + " + ); + + let result = format!("{}", pretty_format_batches(&batches).unwrap()); + + let probe_scan_metrics = probe_scan.metrics().unwrap(); + + // The probe side had 4 rows, but after applying the dynamic filter only 2 rows should remain. + // The number of output rows from the probe side scan should stay consistent across executions. + // Issue: https://github.com/apache/datafusion/issues/17451 + assert_eq!(probe_scan_metrics.output_rows().unwrap(), 2); + + insta::assert_snapshot!( + result, + @r" + +----+----+-----+----+----+-----+ + | a | b | c | a | b | e | + +----+----+-----+----+----+-----+ + | ab | bb | 2.0 | ab | bb | 2.0 | + | aa | ba | 1.0 | aa | ba | 1.0 | + +----+----+-----+----+----+-----+ + ", + ); +} + +// Not portable to sqllogictest: this test specifically pins a +// `RepartitionExec(Hash, 12)` between `HashJoinExec(CollectLeft)` and the +// probe-side scan to verify the dynamic filter link survives that boundary +// (regression for #17451). The same CollectLeft filter content and +// pushdown counters are already covered by the simpler slt port +// (push_down_filter_parquet.slt::test_hashjoin_dynamic_filter_pushdown). 
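+// For reference, the per-partition dynamic filter that Partitioned mode
+// produces (see the snapshot above) behaves roughly like this pseudocode,
+// where `bounds_for_partition_n` stands in for the min/max + IN-set checks:
+//   match hash(a, b) % partition_count {
+//       5 => bounds_for_partition_5(a, b),
+//       8 => bounds_for_partition_8(a, b),
+//       _ => false, // no build rows landed in the other partitions
+//   }
+// CollectLeft builds a single hash table, so the test below expects one
+// flat bounds check instead of a CASE over partitions.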
+#[tokio::test]
+async fn test_hashjoin_dynamic_filter_pushdown_collect_left() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    let build_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab"]),
+            ("b", Utf8, ["ba", "bb"]),
+            ("c", Float64, [1.0, 2.0]) // Extra column not used in join
+        )
+        .unwrap(),
+    ];
+    let build_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("c", DataType::Float64, false),
+    ]));
+    let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema))
+        .with_support(true)
+        .with_batches(build_batches)
+        .build();
+
+    // Create probe side with more values
+    let probe_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab", "ac", "ad"]),
+            ("b", Utf8, ["ba", "bb", "bc", "bd"]),
+            ("e", Float64, [1.0, 2.0, 3.0, 4.0]) // Extra column not used in join
+        )
+        .unwrap(),
+    ];
+    let probe_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("e", DataType::Float64, false),
+    ]));
+    let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema))
+        .with_support(true)
+        .with_batches(probe_batches)
+        .build();
+
+    // Create RepartitionExec nodes for both sides with hash partitioning on join keys
+    let partition_count = 12;
+
+    // Probe side: DataSource -> RepartitionExec(Hash)
+    let probe_hash_exprs = vec![
+        col("a", &probe_side_schema).unwrap(),
+        col("b", &probe_side_schema).unwrap(),
+    ];
+    let probe_repartition = Arc::new(
+        RepartitionExec::try_new(
+            Arc::clone(&probe_scan),
+            Partitioning::Hash(probe_hash_exprs, partition_count), // create multiple partitions on the probe side
+        )
+        .unwrap(),
+    );
+
+    let on = vec![
+        (
+            col("a", &build_side_schema).unwrap(),
+            col("a", &probe_side_schema).unwrap(),
+        ),
+        (
+            col("b", &build_side_schema).unwrap(),
+            col("b", &probe_side_schema).unwrap(),
+        ),
+    ];
+    let hash_join = Arc::new(
+        HashJoinExec::try_new(
+            build_scan,
+            probe_repartition,
+            on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::CollectLeft,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    // Top-level CoalescePartitionsExec
+    let cp =
+        Arc::new(CoalescePartitionsExec::new(hash_join)) as Arc<dyn ExecutionPlan>;
+    // Add a sort for deterministic output
+    let plan = Arc::new(SortExec::new(
+        LexOrdering::new(vec![PhysicalSortExpr::new(
+            col("a", &probe_side_schema).unwrap(),
+            SortOptions::new(true, false), // descending, nulls_first
+        )])
+        .unwrap(),
+        cp,
+    )) as Arc<dyn ExecutionPlan>;
+
+    // expect the predicate to be pushed down into the probe side DataSource
+    insta::assert_snapshot!(
+        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new_post_optimization(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+        -   CoalescePartitionsExec
+        -     HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)]
+        -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+        -       RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1
+        -         DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+          -   CoalescePartitionsExec
+          -     HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)]
+          -       
DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] + " + ); + + // Actually apply the optimization to the plan and execute to see the filter in action + let mut config = ConfigOptions::default(); + config.execution.parquet.pushdown_filters = true; + config.optimizer.enable_dynamic_filter_pushdown = true; + let plan = FilterPushdown::new_post_optimization() + .optimize(plan, &config) + .unwrap(); + let config = SessionConfig::new().with_batch_size(10); + let session_ctx = SessionContext::new_with_config(config); + session_ctx.register_object_store( + ObjectStoreUrl::parse("test://").unwrap().as_ref(), + Arc::new(InMemory::new()), + ); + let state = session_ctx.state(); + let task_ctx = state.task_ctx(); + let batches = collect(Arc::clone(&plan), Arc::clone(&task_ctx)) + .await + .unwrap(); + + // Now check what our filter looks like + insta::assert_snapshot!( + format!("{}", format_plan_for_test(&plan)), + @r" + - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - CoalescePartitionsExec + - HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}, {c0:ab,c1:bb}]) ] + " + ); + + let result = format!("{}", pretty_format_batches(&batches).unwrap()); + + let probe_scan_metrics = probe_scan.metrics().unwrap(); + + // The probe side had 4 rows, but after applying the dynamic filter only 2 rows should remain. + // The number of output rows from the probe side scan should stay consistent across executions. 
+    // Issue: https://github.com/apache/datafusion/issues/17451
+    assert_eq!(probe_scan_metrics.output_rows().unwrap(), 2);
+
+    insta::assert_snapshot!(
+        result,
+        @r"
+    +----+----+-----+----+----+-----+
+    | a  | b  | c   | a  | b  | e   |
+    +----+----+-----+----+----+-----+
+    | ab | bb | 2.0 | ab | bb | 2.0 |
+    | aa | ba | 1.0 | aa | ba | 1.0 |
+    +----+----+-----+----+----+-----+
+    ",
+    );
+}
+
+#[test]
+fn test_hashjoin_parent_filter_pushdown_same_column_names() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    let build_side_schema = Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Utf8, false),
+        Field::new("build_val", DataType::Utf8, false),
+    ]));
+    let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema))
+        .with_support(true)
+        .build();
+
+    let probe_side_schema = Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Utf8, false),
+        Field::new("probe_val", DataType::Utf8, false),
+    ]));
+    let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema))
+        .with_support(true)
+        .build();
+
+    let on = vec![(
+        col("id", &build_side_schema).unwrap(),
+        col("id", &probe_side_schema).unwrap(),
+    )];
+    let join = Arc::new(
+        HashJoinExec::try_new(
+            build_scan,
+            probe_scan,
+            on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::Partitioned,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    let join_schema = join.schema();
+
+    let build_id_filter = col_lit_predicate("id", "aa", &join_schema);
+    let probe_val_filter = col_lit_predicate("probe_val", "x", &join_schema);
+
+    let filter =
+        Arc::new(FilterExec::try_new(build_id_filter, Arc::clone(&join) as _).unwrap());
+    let plan = Arc::new(FilterExec::try_new(probe_val_filter, filter).unwrap())
+        as Arc<dyn ExecutionPlan>;
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: probe_val@3 = x
+        -   FilterExec: id@0 = aa
+        -     HashJoinExec: mode=Partitioned, join_type=Inner, on=[(id@0, id@0)]
+        -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[id, build_val], file_type=test, pushdown_supported=true
+        -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[id, probe_val], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(id@0, id@0)]
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[id, build_val], file_type=test, pushdown_supported=true, predicate=id@0 = aa
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[id, probe_val], file_type=test, pushdown_supported=true, predicate=probe_val@1 = x
+    "
+    );
+}
+
+#[test]
+fn test_hashjoin_parent_filter_pushdown_mark_join() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    let left_schema = Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Utf8, false),
+        Field::new("val", DataType::Utf8, false),
+    ]));
+    let left_scan = TestScanBuilder::new(Arc::clone(&left_schema))
+        .with_support(true)
+        .build();
+
+    let right_schema =
+        Arc::new(Schema::new(vec![Field::new("id", DataType::Utf8, false)]));
+    let right_scan = TestScanBuilder::new(Arc::clone(&right_schema))
+        .with_support(true)
+        .build();
+
+    let on = vec![(
+        col("id", &left_schema).unwrap(),
+        col("id", &right_schema).unwrap(),
+    )];
+    let join = Arc::new(
+        HashJoinExec::try_new(
+            left_scan,
+            right_scan,
+            on,
+            None,
+            &JoinType::LeftMark,
+            None,
+            PartitionMode::Partitioned,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    let join_schema = join.schema();
+
+    let left_filter = col_lit_predicate("val", "x", &join_schema);
+    let mark_filter = col_lit_predicate("mark", true, &join_schema);
+
+    let filter =
+        Arc::new(FilterExec::try_new(left_filter, Arc::clone(&join) as _).unwrap());
+    let plan = Arc::new(FilterExec::try_new(mark_filter, filter).unwrap())
+        as Arc<dyn ExecutionPlan>;
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: mark@2 = true
+        -   FilterExec: val@1 = x
+        -     HashJoinExec: mode=Partitioned, join_type=LeftMark, on=[(id@0, id@0)]
+        -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[id, val], file_type=test, pushdown_supported=true
+        -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[id], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - FilterExec: mark@2 = true
+          -   HashJoinExec: mode=Partitioned, join_type=LeftMark, on=[(id@0, id@0)]
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[id, val], file_type=test, pushdown_supported=true, predicate=val@1 = x
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[id], file_type=test, pushdown_supported=true
+    "
+    );
+}
+
+/// Test that filters on join key columns are pushed to both sides of semi/anti joins.
+/// For LeftSemi/LeftAnti, the output only contains left columns, but filters on
+/// join key columns can also be pushed to the right (non-preserved) side because
+/// the equijoin condition guarantees the key values match.
+#[test]
+fn test_hashjoin_parent_filter_pushdown_semi_anti_join() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    let left_schema = Arc::new(Schema::new(vec![
+        Field::new("k", DataType::Utf8, false),
+        Field::new("v", DataType::Utf8, false),
+    ]));
+    let left_scan = TestScanBuilder::new(Arc::clone(&left_schema))
+        .with_support(true)
+        .build();
+
+    let right_schema = Arc::new(Schema::new(vec![
+        Field::new("k", DataType::Utf8, false),
+        Field::new("w", DataType::Utf8, false),
+    ]));
+    let right_scan = TestScanBuilder::new(Arc::clone(&right_schema))
+        .with_support(true)
+        .build();
+
+    let on = vec![(
+        col("k", &left_schema).unwrap(),
+        col("k", &right_schema).unwrap(),
+    )];
+
+    let join = Arc::new(
+        HashJoinExec::try_new(
+            left_scan,
+            right_scan,
+            on,
+            None,
+            &JoinType::LeftSemi,
+            None,
+            PartitionMode::Partitioned,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    let join_schema = join.schema();
+    // Filter on join key column: k = 'x', should be pushed to BOTH sides
+    let key_filter = col_lit_predicate("k", "x", &join_schema);
+    // Filter on non-key column: v = 'y', should only be pushed to the left side
+    let val_filter = col_lit_predicate("v", "y", &join_schema);
+
+    let filter =
+        Arc::new(FilterExec::try_new(key_filter, Arc::clone(&join) as _).unwrap());
+    let plan = Arc::new(FilterExec::try_new(val_filter, filter).unwrap())
+        as Arc<dyn ExecutionPlan>;
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: v@1 = y
+        -   FilterExec: k@0 = x
+        -     HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(k@0, k@0)]
+        -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[k, v], file_type=test, pushdown_supported=true
+        -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[k, w], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(k@0, k@0)]
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[k, v], file_type=test, pushdown_supported=true, predicate=k@0 = x AND v@1 = y
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[k, w], file_type=test, pushdown_supported=true, predicate=k@0 = x
+    "
+    );
+}
+
+#[test]
+fn test_filter_pushdown_through_union() {
+    let scan1 = TestScanBuilder::new(schema()).with_support(true).build();
+    let scan2 = TestScanBuilder::new(schema()).with_support(true).build();
+
+    let union = UnionExec::try_new(vec![scan1, scan2]).unwrap();
+
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate, union).unwrap());
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo
+        -   UnionExec
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - UnionExec
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
+    "
+    );
+}
+
+#[test]
+fn test_filter_pushdown_through_union_mixed_support() {
+    // Test case where one child supports filter pushdown and one doesn't
+    let scan1 = TestScanBuilder::new(schema()).with_support(true).build();
+    let scan2 = TestScanBuilder::new(schema()).with_support(false).build();
+
+    let union = UnionExec::try_new(vec![scan1, scan2]).unwrap();
+
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate, union).unwrap());
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo
+        -   UnionExec
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+      output:
+        Ok:
+          - UnionExec
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
+          -   FilterExec: a@0 = foo
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+    "
+    );
+}
+
+#[test]
+fn test_filter_pushdown_through_union_does_not_support() {
+    // Test case where neither child supports filter pushdown
+    let scan1 = TestScanBuilder::new(schema()).with_support(false).build();
+    let scan2 = TestScanBuilder::new(schema()).with_support(false).build();
+
+    let union = UnionExec::try_new(vec![scan1, scan2]).unwrap();
+
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate, union).unwrap());
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo
+        -   UnionExec
+        -     DataSourceExec:
file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false + output: + Ok: + - UnionExec + - FilterExec: a@0 = foo + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false + - FilterExec: a@0 = foo + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false + " + ); +} + +#[test] +fn test_filter_with_fetch_fully_pushed_through_union() { + // When a FilterExec with fetch wraps a UnionExec and all predicates are + // pushed down, UnionExec does not support with_fetch, so a LocalLimitExec + // should be inserted to preserve the fetch limit. + let scan1 = TestScanBuilder::new(schema()).with_support(true).build(); + let scan2 = TestScanBuilder::new(schema()).with_support(true).build(); + let union = UnionExec::try_new(vec![scan1, scan2]).unwrap(); + let predicate = col_lit_predicate("a", "foo", &schema()); + let plan = Arc::new( + FilterExecBuilder::new(predicate, union) + .with_fetch(Some(10)) + .build() + .unwrap(), + ); + + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @" + OptimizationTest: + input: + - FilterExec: a@0 = foo, fetch=10 + - UnionExec + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - LocalLimitExec: fetch=10 + - UnionExec + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo + " + ); +} + +#[test] +fn test_filter_with_fetch_and_projection_fully_pushed_through_union() { + // When a FilterExec with both fetch and projection wraps a UnionExec and + // all predicates are pushed down, we should get a ProjectionExec on top of + // a LocalLimitExec wrapping the UnionExec. 
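+    // Expected shape, per the snapshot below (the projection only reorders
+    // columns, so it can sit above the limit without changing row counts):
+    //   ProjectionExec(b, a) -> LocalLimitExec(fetch=5) -> UnionExec -> scans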
+    let scan1 = TestScanBuilder::new(schema()).with_support(true).build();
+    let scan2 = TestScanBuilder::new(schema()).with_support(true).build();
+    let union = UnionExec::try_new(vec![scan1, scan2]).unwrap();
+    let projection = vec![1, 0];
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(
+        FilterExecBuilder::new(predicate, union)
+            .with_fetch(Some(5))
+            .apply_projection(Some(projection))
+            .unwrap()
+            .build()
+            .unwrap(),
+    );
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo, projection=[b@1, a@0], fetch=5
+        -   UnionExec
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - ProjectionExec: expr=[b@1 as b, a@0 as a]
+          -   LocalLimitExec: fetch=5
+          -     UnionExec
+          -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
+          -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
+    "
+    );
+}
+
+#[test]
+fn test_filter_with_fetch_not_fully_pushed_through_union() {
+    // When a FilterExec with fetch wraps a UnionExec whose children don't
+    // support pushdown, the filter is still split into per-child FilterExecs
+    // and the fetch is preserved by a LocalLimitExec above the union.
+    let scan1 = TestScanBuilder::new(schema()).with_support(false).build();
+    let scan2 = TestScanBuilder::new(schema()).with_support(false).build();
+    let union = UnionExec::try_new(vec![scan1, scan2]).unwrap();
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(
+        FilterExecBuilder::new(predicate, union)
+            .with_fetch(Some(8))
+            .build()
+            .unwrap(),
+    );
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo, fetch=8
+        -   UnionExec
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+      output:
+        Ok:
+          - LocalLimitExec: fetch=8
+          -   UnionExec
+          -     FilterExec: a@0 = foo
+          -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+          -     FilterExec: a@0 = foo
+          -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+    "
+    );
+}
+
+/// Schema:
+/// a: String
+/// b: String
+/// c: f64
+static TEST_SCHEMA: LazyLock<SchemaRef> = LazyLock::new(|| {
+    let fields = vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("c", DataType::Float64, false),
+    ];
+    Arc::new(Schema::new(fields))
+});
+
+fn schema() -> SchemaRef {
+    Arc::clone(&TEST_SCHEMA)
+}
+
+// test_topk_with_projection_transformation_on_dyn_filter has been ported
+// to datafusion/sqllogictest/test_files/push_down_filter_parquet.slt; see
+// `topk_proj` fixture for the 4 representative cases (reorder, prune,
+// expression, alias shadowing). The `run_projection_dyn_filter_case`
+// harness was removed along with it.
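+// Example of the helper defined below (hypothetical usage; the rendered
+// form is what appears in the snapshots above):
+//   let p = col_lit_predicate("a", "foo", &schema());
+//   // displays as `a@0 = foo`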
+
+/// Returns a predicate that is a binary expression col = lit
+fn col_lit_predicate(
+    column_name: &str,
+    scalar_value: impl Into<ScalarValue>,
+    schema: &Schema,
+) -> Arc<dyn PhysicalExpr> {
+    let scalar_value = scalar_value.into();
+    Arc::new(BinaryExpr::new(
+        Arc::new(Column::new_with_schema(column_name, schema).unwrap()),
+        Operator::Eq,
+        Arc::new(Literal::new(scalar_value)),
+    ))
+}
+
+// ==== Aggregate Dynamic Filter tests ====
+//
+// The end-to-end min/max dynamic filter cases (simple/min/max/mixed/all-nulls)
+// have been ported to
+// `datafusion/sqllogictest/test_files/push_down_filter_regression.slt`.
+// The `run_aggregate_dyn_filter_case` harness used to drive them was removed
+// along with the test functions.
+
+/// Non-partial (Single) aggregates should skip dynamic filter initialization.
+#[test]
+fn test_aggregate_dynamic_filter_not_created_for_single_mode() {
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)]));
+    let batches = vec![record_batch!(("a", Int32, [5, 1, 3, 8])).unwrap()];
+
+    let scan = TestScanBuilder::new(Arc::clone(&schema))
+        .with_support(true)
+        .with_batches(batches)
+        .build();
+
+    let min_expr =
+        AggregateExprBuilder::new(min_udaf(), vec![col("a", &schema).unwrap()])
+            .schema(Arc::clone(&schema))
+            .alias("min_a")
+            .build()
+            .unwrap();
+
+    let plan: Arc<dyn ExecutionPlan> = Arc::new(
+        AggregateExec::try_new(
+            AggregateMode::Single,
+            PhysicalGroupBy::new_single(vec![]),
+            vec![min_expr.into()],
+            vec![None],
+            scan,
+            Arc::clone(&schema),
+        )
+        .unwrap(),
+    );
+
+    let mut config = ConfigOptions::default();
+    config.execution.parquet.pushdown_filters = true;
+    config.optimizer.enable_dynamic_filter_pushdown = true;
+
+    let optimized = FilterPushdown::new_post_optimization()
+        .optimize(plan, &config)
+        .unwrap();
+
+    let formatted = format_plan_for_test(&optimized);
+    assert!(
+        !formatted.contains("DynamicFilter ["),
+        "dynamic filter should not be created for AggregateMode::Single: {formatted}"
+    );
+}
+
+#[test]
+fn test_pushdown_filter_on_non_first_grouping_column() {
+    // Test that filters on non-first grouping columns are still pushed down
+    // SELECT a, b, count(*) as cnt FROM table GROUP BY a, b HAVING b = 'bar'
+    // The filter is on 'b' (second grouping column), should push down
+    let scan = TestScanBuilder::new(schema()).with_support(true).build();
+
+    let aggregate_expr = vec![
+        AggregateExprBuilder::new(count_udaf(), vec![col("c", &schema()).unwrap()])
+            .schema(schema())
+            .alias("cnt")
+            .build()
+            .map(Arc::new)
+            .unwrap(),
+    ];
+
+    let group_by = PhysicalGroupBy::new_single(vec![
+        (col("a", &schema()).unwrap(), "a".to_string()),
+        (col("b", &schema()).unwrap(), "b".to_string()),
+    ]);
+
+    let aggregate = Arc::new(
+        AggregateExec::try_new(
+            AggregateMode::Final,
+            group_by,
+            aggregate_expr.clone(),
+            vec![None],
+            scan,
+            schema(),
+        )
+        .unwrap(),
+    );
+
+    let predicate = col_lit_predicate("b", "bar", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap());
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: b@1 = bar
+        -   AggregateExec: mode=Final, gby=[a@0 as a, b@1 as b], aggr=[cnt]
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - AggregateExec: mode=Final, gby=[a@0 as a, b@1 as b], aggr=[cnt], ordering_mode=PartiallySorted([1])
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c],
file_type=test, pushdown_supported=true, predicate=b@1 = bar + " + ); +} + +#[test] +fn test_no_pushdown_grouping_sets_filter_on_missing_column() { + // Test that filters on columns missing from some grouping sets are NOT pushed through + let scan = TestScanBuilder::new(schema()).with_support(true).build(); + + let aggregate_expr = vec![ + AggregateExprBuilder::new(count_udaf(), vec![col("c", &schema()).unwrap()]) + .schema(schema()) + .alias("cnt") + .build() + .map(Arc::new) + .unwrap(), + ]; + + // Create GROUPING SETS with (a, b) and (b) + let group_by = PhysicalGroupBy::new( + vec![ + (col("a", &schema()).unwrap(), "a".to_string()), + (col("b", &schema()).unwrap(), "b".to_string()), + ], + vec![ + ( + Arc::new(Literal::new(ScalarValue::Utf8(None))), + "a".to_string(), + ), + ( + Arc::new(Literal::new(ScalarValue::Utf8(None))), + "b".to_string(), + ), + ], + vec![ + vec![false, false], // (a, b) - both present + vec![true, false], // (b) - a is NULL, b present + ], + true, + ); + + let aggregate = Arc::new( + AggregateExec::try_new( + AggregateMode::Final, + group_by, + aggregate_expr.clone(), + vec![None], + scan, + schema(), + ) + .unwrap(), + ); + + // Filter on column 'a' which is missing in the second grouping set, should not be pushed down + let predicate = col_lit_predicate("a", "foo", &schema()); + let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap()); + + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - FilterExec: a@0 = foo + - AggregateExec: mode=Final, gby=[(a@0 as a, b@1 as b), (NULL as a, b@1 as b)], aggr=[cnt] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - FilterExec: a@0 = foo + - AggregateExec: mode=Final, gby=[(a@0 as a, b@1 as b), (NULL as a, b@1 as b)], aggr=[cnt] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + " + ); +} + +#[test] +fn test_pushdown_grouping_sets_filter_on_common_column() { + // Test that filters on columns present in ALL grouping sets ARE pushed through + let scan = TestScanBuilder::new(schema()).with_support(true).build(); + + let aggregate_expr = vec![ + AggregateExprBuilder::new(count_udaf(), vec![col("c", &schema()).unwrap()]) + .schema(schema()) + .alias("cnt") + .build() + .map(Arc::new) + .unwrap(), + ]; + + // Create GROUPING SETS with (a, b) and (b) + let group_by = PhysicalGroupBy::new( + vec![ + (col("a", &schema()).unwrap(), "a".to_string()), + (col("b", &schema()).unwrap(), "b".to_string()), + ], + vec![ + ( + Arc::new(Literal::new(ScalarValue::Utf8(None))), + "a".to_string(), + ), + ( + Arc::new(Literal::new(ScalarValue::Utf8(None))), + "b".to_string(), + ), + ], + vec![ + vec![false, false], // (a, b) - both present + vec![true, false], // (b) - a is NULL, b present + ], + true, + ); + + let aggregate = Arc::new( + AggregateExec::try_new( + AggregateMode::Final, + group_by, + aggregate_expr.clone(), + vec![None], + scan, + schema(), + ) + .unwrap(), + ); + + // Filter on column 'b' which is present in all grouping sets will be pushed down + let predicate = col_lit_predicate("b", "bar", &schema()); + let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap()); + + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - FilterExec: b@1 = bar + - AggregateExec: mode=Final, gby=[(a@0 as a, b@1 as 
b), (NULL as a, b@1 as b)], aggr=[cnt] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - AggregateExec: mode=Final, gby=[(a@0 as a, b@1 as b), (NULL as a, b@1 as b)], aggr=[cnt], ordering_mode=PartiallySorted([1]) + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=b@1 = bar + " + ); +} + +#[test] +fn test_pushdown_with_empty_group_by() { + // Test that filters can be pushed down when GROUP BY is empty (no grouping columns) + // SELECT count(*) as cnt FROM table WHERE a = 'foo' + // There are no grouping columns, so the filter should still push down + let scan = TestScanBuilder::new(schema()).with_support(true).build(); + + let aggregate_expr = vec![ + AggregateExprBuilder::new(count_udaf(), vec![col("c", &schema()).unwrap()]) + .schema(schema()) + .alias("cnt") + .build() + .map(Arc::new) + .unwrap(), + ]; + + // Empty GROUP BY - no grouping columns + let group_by = PhysicalGroupBy::new_single(vec![]); + + let aggregate = Arc::new( + AggregateExec::try_new( + AggregateMode::Final, + group_by, + aggregate_expr.clone(), + vec![None], + scan, + schema(), + ) + .unwrap(), + ); + + // Filter on 'a' + let predicate = col_lit_predicate("a", "foo", &schema()); + let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap()); + + // The filter should be pushed down even with empty GROUP BY + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - FilterExec: a@0 = foo + - AggregateExec: mode=Final, gby=[], aggr=[cnt] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - AggregateExec: mode=Final, gby=[], aggr=[cnt] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo + " + ); +} + +#[test] +fn test_pushdown_through_aggregate_with_reordered_input_columns() { + let scan = TestScanBuilder::new(schema()).with_support(true).build(); + + // Reorder scan output from (a, b, c) to (c, a, b) + let reordered_schema = Arc::new(Schema::new(vec![ + Field::new("c", DataType::Float64, false), + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::Utf8, false), + ])); + let projection = Arc::new( + ProjectionExec::try_new( + vec![ + (col("c", &schema()).unwrap(), "c".to_string()), + (col("a", &schema()).unwrap(), "a".to_string()), + (col("b", &schema()).unwrap(), "b".to_string()), + ], + scan, + ) + .unwrap(), + ); + + let aggregate_expr = vec![ + AggregateExprBuilder::new( + count_udaf(), + vec![col("c", &reordered_schema).unwrap()], + ) + .schema(reordered_schema.clone()) + .alias("cnt") + .build() + .map(Arc::new) + .unwrap(), + ]; + + // Group by a@1, b@2 (input indices in reordered schema) + let group_by = PhysicalGroupBy::new_single(vec![ + (col("a", &reordered_schema).unwrap(), "a".to_string()), + (col("b", &reordered_schema).unwrap(), "b".to_string()), + ]); + + let aggregate = Arc::new( + AggregateExec::try_new( + AggregateMode::Final, + group_by, + aggregate_expr, + vec![None], + projection, + reordered_schema, + ) + .unwrap(), + ); + + // Filter on b@1 in aggregate's output schema (a@0, b@1, cnt@2) + // The grouping expr for b references input index 2, but output index is 1. 
+ let agg_output_schema = aggregate.schema(); + let predicate = col_lit_predicate("b", "bar", &agg_output_schema); + let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap()); + + // The filter should be pushed down + insta::assert_snapshot!( + OptimizationTest::new(plan, FilterPushdown::new(), true), + @r" + OptimizationTest: + input: + - FilterExec: b@1 = bar + - AggregateExec: mode=Final, gby=[a@1 as a, b@2 as b], aggr=[cnt] + - ProjectionExec: expr=[c@2 as c, a@0 as a, b@1 as b] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - AggregateExec: mode=Final, gby=[a@1 as a, b@2 as b], aggr=[cnt], ordering_mode=PartiallySorted([1]) + - ProjectionExec: expr=[c@2 as c, a@0 as a, b@1 as b] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=b@1 = bar + " + ); +} + +#[test] +fn test_pushdown_through_aggregate_grouping_sets_with_reordered_input() { + let scan = TestScanBuilder::new(schema()).with_support(true).build(); + + let reordered_schema = Arc::new(Schema::new(vec![ + Field::new("c", DataType::Float64, false), + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::Utf8, false), + ])); + let projection = Arc::new( + ProjectionExec::try_new( + vec![ + (col("c", &schema()).unwrap(), "c".to_string()), + (col("a", &schema()).unwrap(), "a".to_string()), + (col("b", &schema()).unwrap(), "b".to_string()), + ], + scan, + ) + .unwrap(), + ); + + let aggregate_expr = vec![ + AggregateExprBuilder::new( + count_udaf(), + vec![col("c", &reordered_schema).unwrap()], + ) + .schema(reordered_schema.clone()) + .alias("cnt") + .build() + .map(Arc::new) + .unwrap(), + ]; + + // Use grouping sets (a, b) and (b). 
+    let group_by = PhysicalGroupBy::new(
+        vec![
+            (col("a", &reordered_schema).unwrap(), "a".to_string()),
+            (col("b", &reordered_schema).unwrap(), "b".to_string()),
+        ],
+        vec![
+            (
+                Arc::new(Literal::new(ScalarValue::Utf8(None))),
+                "a".to_string(),
+            ),
+            (
+                Arc::new(Literal::new(ScalarValue::Utf8(None))),
+                "b".to_string(),
+            ),
+        ],
+        vec![vec![false, false], vec![true, false]],
+        true,
+    );
+
+    let aggregate = Arc::new(
+        AggregateExec::try_new(
+            AggregateMode::Final,
+            group_by,
+            aggregate_expr,
+            vec![None],
+            projection,
+            reordered_schema,
+        )
+        .unwrap(),
+    );
+
+    let agg_output_schema = aggregate.schema();
+
+    // Filter on b (present in all grouping sets) should be pushed down
+    let predicate = col_lit_predicate("b", "bar", &agg_output_schema);
+    let plan = Arc::new(FilterExec::try_new(predicate, aggregate.clone()).unwrap());
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: b@1 = bar
+        -   AggregateExec: mode=Final, gby=[(a@1 as a, b@2 as b), (NULL as a, b@2 as b)], aggr=[cnt]
+        -     ProjectionExec: expr=[c@2 as c, a@0 as a, b@1 as b]
+        -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - AggregateExec: mode=Final, gby=[(a@1 as a, b@2 as b), (NULL as a, b@2 as b)], aggr=[cnt], ordering_mode=PartiallySorted([1])
+          -   ProjectionExec: expr=[c@2 as c, a@0 as a, b@1 as b]
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=b@1 = bar
+    "
+    );
+
+    // Filter on a (missing from second grouping set) should not be pushed down
+    let predicate = col_lit_predicate("a", "foo", &agg_output_schema);
+    let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap());
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo
+        -   AggregateExec: mode=Final, gby=[(a@1 as a, b@2 as b), (NULL as a, b@2 as b)], aggr=[cnt]
+        -     ProjectionExec: expr=[c@2 as c, a@0 as a, b@1 as b]
+        -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - FilterExec: a@0 = foo
+          -   AggregateExec: mode=Final, gby=[(a@1 as a, b@2 as b), (NULL as a, b@2 as b)], aggr=[cnt]
+          -     ProjectionExec: expr=[c@2 as c, a@0 as a, b@1 as b]
+          -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+    "
+    );
+}
+
+// Regression coverage for https://github.com/apache/datafusion/issues/21065:
+// the two tests above ensure that filters are pushed down through an
+// AggregateExec whose input columns are reordered by a ProjectionExec.
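+
+// Illustrative aside, not part of this PR: the key step the reordered-input
+// tests exercise is remapping a filter column from the aggregate's *output*
+// schema (where `b` sits at index 1) back to its *input* schema (where `b`
+// sits at index 2) before the filter can travel further down. A minimal
+// sketch of that remapping for the plain-column case; the helper is
+// hypothetical and is not how FilterPushdown actually implements it.
+#[allow(dead_code)]
+fn remap_output_column_to_input(
+    filter_col: &Column,
+    group_exprs: &[(Arc<dyn PhysicalExpr>, String)],
+) -> Option<Column> {
+    // The i-th grouping expression produces the i-th output column of the
+    // aggregate, so look up the grouping expression at the filter column's
+    // output index and reuse its input index if it is a plain column.
+    let (expr, name) = group_exprs.get(filter_col.index())?;
+    let input_col = expr.as_any().downcast_ref::<Column>()?;
+    Some(Column::new(name, input_col.index()))
+}
+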
+#[test]
+fn test_pushdown_with_computed_grouping_key() {
+    // Test filter pushdown with computed grouping expression
+    // SELECT (c + 1.0) as c_plus_1, count(*) FROM table WHERE c > 5.0 GROUP BY (c + 1.0)
+
+    let scan = TestScanBuilder::new(schema()).with_support(true).build();
+
+    let predicate = Arc::new(BinaryExpr::new(
+        col("c", &schema()).unwrap(),
+        Operator::Gt,
+        Arc::new(Literal::new(ScalarValue::Float64(Some(5.0)))),
+    )) as Arc<dyn PhysicalExpr>;
+    let filter = Arc::new(FilterExec::try_new(predicate, scan).unwrap());
+
+    let aggregate_expr = vec![
+        AggregateExprBuilder::new(count_udaf(), vec![col("a", &schema()).unwrap()])
+            .schema(schema())
+            .alias("cnt")
+            .build()
+            .map(Arc::new)
+            .unwrap(),
+    ];
+
+    let c_plus_one = Arc::new(BinaryExpr::new(
+        col("c", &schema()).unwrap(),
+        Operator::Plus,
+        Arc::new(Literal::new(ScalarValue::Float64(Some(1.0)))),
+    )) as Arc<dyn PhysicalExpr>;
+
+    let group_by =
+        PhysicalGroupBy::new_single(vec![(c_plus_one, "c_plus_1".to_string())]);
+
+    let plan = Arc::new(
+        AggregateExec::try_new(
+            AggregateMode::Final,
+            group_by,
+            aggregate_expr.clone(),
+            vec![None],
+            filter,
+            schema(),
+        )
+        .unwrap(),
+    );
+
+    // The filter should be pushed down because 'c' is extracted from the grouping expression (c + 1.0)
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - AggregateExec: mode=Final, gby=[c@2 + 1 as c_plus_1], aggr=[cnt]
+        -   FilterExec: c@2 > 5
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - AggregateExec: mode=Final, gby=[c@2 + 1 as c_plus_1], aggr=[cnt]
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=c@2 > 5
+    "
+    );
+}
+
+// Not portable to sqllogictest: in CollectLeft (the mode SQL picks for small
+// data), an empty build side short-circuits the HashJoin and the probe scan
+// is never executed, so its dynamic filter stays at `[ empty ]` rather than
+// collapsing to `[ false ]`. The Rust test uses PartitionMode::Partitioned
+// on a hand-wired plan, which does trigger the `false` path.
+#[tokio::test]
+async fn test_hashjoin_dynamic_filter_all_partitions_empty() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    // Test scenario where all build-side partitions are empty
+    // This validates the code path that sets the filter to `false` when no rows can match
+
+    // Create empty build side
+    let build_batches = vec![];
+    let build_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+    ]));
+    let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema))
+        .with_support(true)
+        .with_batches(build_batches)
+        .build();
+
+    // Create probe side with some data
+    let probe_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab", "ac"]),
+            ("b", Utf8, ["ba", "bb", "bc"])
+        )
+        .unwrap(),
+    ];
+    let probe_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+    ]));
+    let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema))
+        .with_support(true)
+        .with_batches(probe_batches)
+        .build();
+
+    // Create RepartitionExec nodes for both sides
+    let partition_count = 4;
+
+    let build_hash_exprs = vec![
+        col("a", &build_side_schema).unwrap(),
+        col("b", &build_side_schema).unwrap(),
+    ];
+    let build_repartition = Arc::new(
+        RepartitionExec::try_new(
+            build_scan,
+            Partitioning::Hash(build_hash_exprs, partition_count),
+        )
+        .unwrap(),
+    );
+
+    let probe_hash_exprs = vec![
+        col("a", &probe_side_schema).unwrap(),
+        col("b", &probe_side_schema).unwrap(),
+    ];
+    let probe_repartition = Arc::new(
+        RepartitionExec::try_new(
+            Arc::clone(&probe_scan),
+            Partitioning::Hash(probe_hash_exprs, partition_count),
+        )
+        .unwrap(),
+    );
+
+    // Create HashJoinExec
+    let on = vec![
+        (
+            col("a", &build_side_schema).unwrap(),
+            col("a", &probe_side_schema).unwrap(),
+        ),
+        (
+            col("b", &build_side_schema).unwrap(),
+            col("b", &probe_side_schema).unwrap(),
+        ),
+    ];
+    let plan = Arc::new(
+        HashJoinExec::try_new(
+            build_repartition,
+            probe_repartition,
+            on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::Partitioned,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    // Apply the filter pushdown optimizer
+    let mut config = SessionConfig::new();
+    config.options_mut().execution.parquet.pushdown_filters = true;
+    let optimizer = FilterPushdown::new_post_optimization();
+    let plan = optimizer.optimize(plan, config.options()).unwrap();
+
+    insta::assert_snapshot!(
+        format_plan_for_test(&plan),
+        @r"
+    - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)]
+    -   RepartitionExec: partitioning=Hash([a@0, b@1], 4), input_partitions=1
+    -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b], file_type=test, pushdown_supported=true
+    -   RepartitionExec: partitioning=Hash([a@0, b@1], 4), input_partitions=1
+    -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]
+    "
+    );
+
+    // Run the plan so the dynamic filter gets updated: with an empty build
+    // side it should collapse to `false`
+    let session_ctx = SessionContext::new_with_config(config);
+    session_ctx.register_object_store(
+        ObjectStoreUrl::parse("test://").unwrap().as_ref(),
+        Arc::new(InMemory::new()),
+    );
+    let state = session_ctx.state();
+    let task_ctx = state.task_ctx();
+    // Execute all partitions (required for partitioned hash join coordination)
+    let _batches = collect(Arc::clone(&plan), Arc::clone(&task_ctx))
+        .await
+        .unwrap();
+
+    // After execution the dynamic filter should have collapsed to `false`:
+    // the empty build side means no probe row can ever match
+    insta::assert_snapshot!(
+        format_plan_for_test(&plan),
+        @r"
+    - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)]
+    -   RepartitionExec: partitioning=Hash([a@0, b@1], 4), input_partitions=1
+    -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b], file_type=test, pushdown_supported=true
+    -   RepartitionExec: partitioning=Hash([a@0, b@1], 4), input_partitions=1
+    -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ false ]
+    "
+    );
+}
+
+// Not portable to sqllogictest: same reason as
+// test_hashjoin_dynamic_filter_pushdown_partitioned — hand-wires
+// PartitionMode::Partitioned, which SQL never picks for small parquet inputs.
+#[tokio::test]
+async fn test_hashjoin_hash_table_pushdown_partitioned() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    // Create build side with limited values
+    let build_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab"]),
+            ("b", Utf8, ["ba", "bb"]),
+            ("c", Float64, [1.0, 2.0]) // Extra column not used in join
+        )
+        .unwrap(),
+    ];
+    let build_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("c", DataType::Float64, false),
+    ]));
+    let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema))
+        .with_support(true)
+        .with_batches(build_batches)
+        .build();
+
+    // Create probe side with more values
+    let probe_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab", "ac", "ad"]),
+            ("b", Utf8, ["ba", "bb", "bc", "bd"]),
+            ("e", Float64, [1.0, 2.0, 3.0, 4.0]) // Extra column not used in join
+        )
+        .unwrap(),
+    ];
+    let probe_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("e", DataType::Float64, false),
+    ]));
+    let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema))
+        .with_support(true)
+        .with_batches(probe_batches)
+        .build();
+
+    // Create RepartitionExec nodes for both sides with hash partitioning on join keys
+    let partition_count = 12;
+
+    // Build side: DataSource -> RepartitionExec (Hash)
+    let build_hash_exprs = vec![
+        col("a", &build_side_schema).unwrap(),
+        col("b", &build_side_schema).unwrap(),
+    ];
+    let build_repartition = Arc::new(
+        RepartitionExec::try_new(
+            build_scan,
+            Partitioning::Hash(build_hash_exprs, partition_count),
+        )
+        .unwrap(),
+    );
+
+    // Probe side: DataSource -> RepartitionExec (Hash)
+    let probe_hash_exprs = vec![
+        col("a", &probe_side_schema).unwrap(),
+        col("b", &probe_side_schema).unwrap(),
+    ];
+    let probe_repartition = Arc::new(
+        RepartitionExec::try_new(
+            Arc::clone(&probe_scan),
+            Partitioning::Hash(probe_hash_exprs, partition_count),
+        )
+        .unwrap(),
+    );
+
+    // Create HashJoinExec with partitioned inputs
+    let on = vec![
+        (
+            col("a", &build_side_schema).unwrap(),
+            col("a", &probe_side_schema).unwrap(),
+        ),
+        (
+            col("b", &build_side_schema).unwrap(),
+            col("b", &probe_side_schema).unwrap(),
+        ),
+    ];
+    let hash_join = Arc::new(
+        HashJoinExec::try_new(
+            build_repartition,
+            probe_repartition,
+            on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::Partitioned,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    // Top-level CoalescePartitionsExec
+    let cp = Arc::new(CoalescePartitionsExec::new(hash_join)) as Arc<dyn ExecutionPlan>;
+    // Add a sort for deterministic output
+    let plan = Arc::new(SortExec::new(
+        LexOrdering::new(vec![PhysicalSortExpr::new(
+            col("a", &probe_side_schema).unwrap(),
+            SortOptions::new(true, false), // descending, nulls last
+        )])
+        .unwrap(),
+        cp,
+    )) as Arc<dyn ExecutionPlan>;
+
+    // Apply the optimization with config setting that forces HashTable strategy
+    let session_config = SessionConfig::default()
+        .with_batch_size(10)
+        .set_usize("datafusion.optimizer.hash_join_inlist_pushdown_max_size", 1)
+        .set_bool("datafusion.execution.parquet.pushdown_filters", true)
+        .set_bool("datafusion.optimizer.enable_dynamic_filter_pushdown", true);
+    let plan = FilterPushdown::new_post_optimization()
+        .optimize(plan, session_config.options())
+        .unwrap();
+    let session_ctx = SessionContext::new_with_config(session_config);
+    session_ctx.register_object_store(
+        ObjectStoreUrl::parse("test://").unwrap().as_ref(),
+        Arc::new(InMemory::new()),
+    );
+    let state = session_ctx.state();
+    let task_ctx = state.task_ctx();
+    let batches = collect(Arc::clone(&plan), Arc::clone(&task_ctx))
+        .await
+        .unwrap();
+
+    // Verify that hash_lookup is used instead of IN (SET)
+    let plan_str = format_plan_for_test(&plan).to_string();
+    assert!(
+        plan_str.contains("hash_lookup"),
+        "Expected hash_lookup in plan but got: {plan_str}"
+    );
+    assert!(
+        !plan_str.contains("IN (SET)"),
+        "Expected no IN (SET) in plan but got: {plan_str}"
+    );
+
+    let result = format!("{}", pretty_format_batches(&batches).unwrap());
+
+    let probe_scan_metrics = probe_scan.metrics().unwrap();
+
+    // The probe side had 4 rows, but after applying the dynamic filter only 2 rows should remain.
+    assert_eq!(probe_scan_metrics.output_rows().unwrap(), 2);
+
+    // Results should be identical to the InList version
+    insta::assert_snapshot!(
+        result,
+        @r"
+    +----+----+-----+----+----+-----+
+    | a  | b  | c   | a  | b  | e   |
+    +----+----+-----+----+----+-----+
+    | ab | bb | 2.0 | ab | bb | 2.0 |
+    | aa | ba | 1.0 | aa | ba | 1.0 |
+    +----+----+-----+----+----+-----+
+    ",
+    );
+}
+
+// Ported to push_down_filter_parquet.slt (`hl_build`/`hl_probe` fixture).
+// Rust version retained only because the slt port cannot hand-wire the
+// RepartitionExec-above-probe shape this test uses; the hash_lookup vs
+// IN (SET) invariant is captured in the slt port.
+#[tokio::test]
+async fn test_hashjoin_hash_table_pushdown_collect_left() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    let build_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab"]),
+            ("b", Utf8, ["ba", "bb"]),
+            ("c", Float64, [1.0, 2.0]) // Extra column not used in join
+        )
+        .unwrap(),
+    ];
+    let build_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("c", DataType::Float64, false),
+    ]));
+    let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema))
+        .with_support(true)
+        .with_batches(build_batches)
+        .build();
+
+    // Create probe side with more values
+    let probe_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab", "ac", "ad"]),
+            ("b", Utf8, ["ba", "bb", "bc", "bd"]),
+            ("e", Float64, [1.0, 2.0, 3.0, 4.0]) // Extra column not used in join
+        )
+        .unwrap(),
+    ];
+    let probe_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("e", DataType::Float64, false),
+    ]));
+    let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema))
+        .with_support(true)
+        .with_batches(probe_batches)
+        .build();
+
+    // Create a RepartitionExec for the probe side with hash partitioning on join keys
+    let partition_count = 12;
+
+    // Probe side: DataSource -> RepartitionExec(Hash)
+    let probe_hash_exprs = vec![
+        col("a", &probe_side_schema).unwrap(),
+        col("b", &probe_side_schema).unwrap(),
+    ];
+    let probe_repartition = Arc::new(
+        RepartitionExec::try_new(
+            Arc::clone(&probe_scan),
+            Partitioning::Hash(probe_hash_exprs, partition_count), // create multiple partitions on the probe side
+        )
+        .unwrap(),
+    );
+
+    let on = vec![
+        (
+            col("a", &build_side_schema).unwrap(),
+            col("a", &probe_side_schema).unwrap(),
+        ),
+        (
+            col("b", &build_side_schema).unwrap(),
+            col("b", &probe_side_schema).unwrap(),
+        ),
+    ];
+    let hash_join = Arc::new(
+        HashJoinExec::try_new(
+            build_scan,
+            probe_repartition,
+            on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::CollectLeft,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    // Top-level CoalescePartitionsExec
+    let cp = Arc::new(CoalescePartitionsExec::new(hash_join)) as Arc<dyn ExecutionPlan>;
+    // Add a sort for deterministic output
+    let plan = Arc::new(SortExec::new(
+        LexOrdering::new(vec![PhysicalSortExpr::new(
+            col("a", &probe_side_schema).unwrap(),
+            SortOptions::new(true, false), // descending, nulls last
+        )])
+        .unwrap(),
+        cp,
+    )) as Arc<dyn ExecutionPlan>;
+
+    // Apply the optimization with config setting that forces HashTable strategy
+    let session_config = SessionConfig::default()
+        .with_batch_size(10)
+        .set_usize("datafusion.optimizer.hash_join_inlist_pushdown_max_size", 1)
+        .set_bool("datafusion.execution.parquet.pushdown_filters", true)
+        .set_bool("datafusion.optimizer.enable_dynamic_filter_pushdown", true);
+    let plan = FilterPushdown::new_post_optimization()
+        .optimize(plan, session_config.options())
+        .unwrap();
+    let session_ctx = SessionContext::new_with_config(session_config);
+    session_ctx.register_object_store(
+        ObjectStoreUrl::parse("test://").unwrap().as_ref(),
+        Arc::new(InMemory::new()),
+    );
+    let state = session_ctx.state();
+    let task_ctx = state.task_ctx();
+    let batches = collect(Arc::clone(&plan), Arc::clone(&task_ctx))
+        .await
+        .unwrap();
+
+    // Verify that hash_lookup is used instead of IN (SET)
+    let plan_str = format_plan_for_test(&plan).to_string();
+    assert!(
+        plan_str.contains("hash_lookup"),
+        "Expected hash_lookup in plan but got: {plan_str}"
+    );
+    assert!(
+        !plan_str.contains("IN (SET)"),
+        "Expected no IN (SET) in plan but got: {plan_str}"
+    );
+
+    let result = format!("{}", pretty_format_batches(&batches).unwrap());
+
+    let probe_scan_metrics = probe_scan.metrics().unwrap();
+
+    // The probe side had 4 rows, but after applying the dynamic filter only 2 rows should remain.
+    assert_eq!(probe_scan_metrics.output_rows().unwrap(), 2);
+
+    // Results should be identical to the InList version
+    insta::assert_snapshot!(
+        result,
+        @r"
+    +----+----+-----+----+----+-----+
+    | a  | b  | c   | a  | b  | e   |
+    +----+----+-----+----+----+-----+
+    | ab | bb | 2.0 | ab | bb | 2.0 |
+    | aa | ba | 1.0 | aa | ba | 1.0 |
+    +----+----+-----+----+----+-----+
+    ",
+    );
+}
+
+// Not portable to sqllogictest: asserts on `HashJoinExec::dynamic_filter().is_used()`,
+// which is a debug-only API. The observable behavior (probe-side scan
+// receiving the dynamic filter when the data source supports it) is
+// already covered by the simpler CollectLeft port in push_down_filter_parquet.slt;
+// the with_support(false) branch has no SQL analog (parquet always supports
+// pushdown).
+#[tokio::test]
+async fn test_hashjoin_dynamic_filter_pushdown_is_used() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    // Test both cases: probe side with and without filter pushdown support
+    for (probe_supports_pushdown, expected_is_used) in [(false, false), (true, true)] {
+        let build_side_schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Utf8, false),
+            Field::new("b", DataType::Utf8, false),
+        ]));
+        let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema))
+            .with_support(true)
+            .with_batches(vec![
+                record_batch!(("a", Utf8, ["aa", "ab"]), ("b", Utf8, ["ba", "bb"]))
+                    .unwrap(),
+            ])
+            .build();
+
+        let probe_side_schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Utf8, false),
+            Field::new("b", DataType::Utf8, false),
+        ]));
+        let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema))
+            .with_support(probe_supports_pushdown)
+            .with_batches(vec![
+                record_batch!(
+                    ("a", Utf8, ["aa", "ab", "ac", "ad"]),
+                    ("b", Utf8, ["ba", "bb", "bc", "bd"])
+                )
+                .unwrap(),
+            ])
+            .build();
+
+        let on = vec![
+            (
+                col("a", &build_side_schema).unwrap(),
+                col("a", &probe_side_schema).unwrap(),
+            ),
+            (
+                col("b", &build_side_schema).unwrap(),
+                col("b", &probe_side_schema).unwrap(),
+            ),
+        ];
+        let plan = Arc::new(
+            HashJoinExec::try_new(
+                build_scan,
+                probe_scan,
+                on,
+                None,
+                &JoinType::Inner,
+                None,
+                PartitionMode::CollectLeft,
+                datafusion_common::NullEquality::NullEqualsNothing,
+                false,
+            )
+            .unwrap(),
+        ) as Arc<dyn ExecutionPlan>;
+
+        // Apply filter pushdown optimization
+        let mut config = ConfigOptions::default();
+        config.execution.parquet.pushdown_filters = true;
+        config.optimizer.enable_dynamic_filter_pushdown = true;
+        let plan = FilterPushdown::new_post_optimization()
+            .optimize(plan, &config)
+            .unwrap();
+
+        // Get the HashJoinExec to check the dynamic filter
+        let hash_join = plan
+            .downcast_ref::<HashJoinExec>()
+            .expect("Plan should be HashJoinExec");
+
+        // Verify that a dynamic filter was created
+        let dynamic_filter = hash_join
+            .dynamic_filter()
+            .expect("Dynamic filter should be created");
+
+        // Verify that is_used() returns the expected value based on probe side support.
+        // When probe_supports_pushdown=false: no consumer holds a reference (is_used=false)
+        // When probe_supports_pushdown=true: probe side holds a reference (is_used=true)
+        assert_eq!(
+            dynamic_filter.is_used(),
+            expected_is_used,
+            "is_used() should return {expected_is_used} when probe side support is {probe_supports_pushdown}"
+        );
+    }
+}
+
+/// Regression test for https://github.com/apache/datafusion/issues/20109.
+///
+/// Not portable to sqllogictest: the regression specifically targets the
+/// physical FilterPushdown rule running over *stacked* FilterExecs with
+/// projections on a MemorySourceConfig. In SQL the logical optimizer
+/// collapses the two filters before physical planning, so the stacked
+/// FilterExec shape this test exercises is unreachable.
+#[tokio::test]
+async fn test_filter_with_projection_pushdown() {
+    use arrow::array::{Int64Array, RecordBatch, StringArray};
+    use datafusion_physical_plan::collect;
+    use datafusion_physical_plan::filter::FilterExecBuilder;
+
+    // Create schema: [time, event, size]
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("time", DataType::Int64, false),
+        Field::new("event", DataType::Utf8, false),
+        Field::new("size", DataType::Int64, false),
+    ]));
+
+    // Create sample data
+    let timestamps = vec![100i64, 200, 300, 400, 500];
+    let events = vec!["Ingestion", "Ingestion", "Query", "Ingestion", "Query"];
+    let sizes = vec![10i64, 20, 30, 40, 50];
+
+    let batch = RecordBatch::try_new(
+        schema.clone(),
+        vec![
+            Arc::new(Int64Array::from(timestamps)),
+            Arc::new(StringArray::from(events)),
+            Arc::new(Int64Array::from(sizes)),
+        ],
+    )
+    .unwrap();
+
+    // Create data source
+    let memory_exec = datafusion_datasource::memory::MemorySourceConfig::try_new_exec(
+        &[vec![batch]],
+        schema.clone(),
+        None,
+    )
+    .unwrap();
+
+    // First FilterExec: time < 350 with projection=[event@1, size@2]
+    let time_col = col("time", &memory_exec.schema()).unwrap();
+    let time_filter = Arc::new(BinaryExpr::new(
+        time_col,
+        Operator::Lt,
+        Arc::new(Literal::new(ScalarValue::Int64(Some(350)))),
+    ));
+    let filter1 = Arc::new(
+        FilterExecBuilder::new(time_filter, memory_exec)
+            .apply_projection(Some(vec![1, 2]))
+            .unwrap()
+            .build()
+            .unwrap(),
+    );
+
+    // Second FilterExec: event = 'Ingestion' with projection=[size@1]
+    let event_col = col("event", &filter1.schema()).unwrap();
+    let event_filter = Arc::new(BinaryExpr::new(
+        event_col,
+        Operator::Eq,
+        Arc::new(Literal::new(ScalarValue::Utf8(Some(
+            "Ingestion".to_string(),
+        )))),
+    ));
+    let filter2 = Arc::new(
+        FilterExecBuilder::new(event_filter, filter1)
+            .apply_projection(Some(vec![1]))
+            .unwrap()
+            .build()
+            .unwrap(),
+    );
+
+    // Apply filter pushdown optimization
+    let config = ConfigOptions::default();
+    let optimized_plan = FilterPushdown::new()
+        .optimize(Arc::clone(&filter2) as Arc<dyn ExecutionPlan>, &config)
+        .unwrap();
+
+    // Execute the optimized plan - this should not error
+    let ctx = SessionContext::new();
+    let result = collect(optimized_plan, ctx.task_ctx()).await.unwrap();
+
+    // Verify results: should return rows where time < 350 AND event = 'Ingestion'
+    // That's rows with time=100,200 (both have event='Ingestion'), so sizes 10,20
+    let expected = [
+        "+------+",
+        "| size |",
+        "+------+",
+        "| 10   |",
+        "| 20   |",
+        "+------+",
+    ];
+    assert_batches_eq!(expected, &result);
+}
+
+/// Test that ExecutionPlan::apply_expressions() can discover dynamic filters across the plan tree.
+///
+/// Not portable to sqllogictest: asserts by walking the plan tree with
+/// `apply_expressions` + `downcast_ref::<DynamicFilterPhysicalExpr>` and
+/// counting nodes. Neither API is observable from SQL.
+#[tokio::test]
+async fn test_discover_dynamic_filters_via_expressions_api() {
+    use datafusion_common::JoinType;
+    use datafusion_common::tree_node::TreeNodeRecursion;
+    use datafusion_physical_expr::expressions::DynamicFilterPhysicalExpr;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    fn count_dynamic_filters(plan: &Arc<dyn ExecutionPlan>) -> usize {
+        let mut count = 0;
+
+        // Check expressions from this node using apply_expressions
+        let _ = plan.apply_expressions(&mut |expr| {
+            if let Some(_df) = expr.downcast_ref::<DynamicFilterPhysicalExpr>() {
+                count += 1;
+            }
+            Ok(TreeNodeRecursion::Continue)
+        });
+
+        // Recursively visit children
+        for child in plan.children() {
+            count += count_dynamic_filters(child);
+        }
+
+        count
+    }
+
+    // Create build side (left)
+    let build_batches =
+        vec![record_batch!(("a", Utf8, ["foo", "bar"]), ("b", Int32, [1, 2])).unwrap()];
+    let build_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Int32, false),
+    ]));
+    let build_scan = TestScanBuilder::new(build_schema.clone())
+        .with_support(true)
+        .with_batches(build_batches)
+        .build();
+
+    // Create probe side (right)
+    let probe_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["foo", "bar", "baz", "qux"]),
+            ("c", Float64, [1.0, 2.0, 3.0, 4.0])
+        )
+        .unwrap(),
+    ];
+    let probe_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("c", DataType::Float64, false),
+    ]));
+    let probe_scan = TestScanBuilder::new(probe_schema.clone())
+        .with_support(true)
+        .with_batches(probe_batches)
+        .build();
+
+    // Create HashJoinExec
+    let plan = Arc::new(
+        HashJoinExec::try_new(
+            build_scan,
+            probe_scan,
+            vec![(
+                col("a", &build_schema).unwrap(),
+                col("a", &probe_schema).unwrap(),
+            )],
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::CollectLeft,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    ) as Arc<dyn ExecutionPlan>;
+
+    // Before optimization: no dynamic filters
+    let count_before = count_dynamic_filters(&plan);
+    assert_eq!(
+        count_before, 0,
+        "Before optimization, should have no dynamic filters"
+    );
+
+    // Apply filter pushdown optimization (this creates dynamic filters)
+    let mut config = ConfigOptions::default();
+    config.optimizer.enable_dynamic_filter_pushdown = true;
+    config.execution.parquet.pushdown_filters = true;
+    let optimized_plan = FilterPushdown::new_post_optimization()
+        .optimize(plan, &config)
+        .unwrap();
+
+    // After optimization: should discover dynamic filters
+    // We expect 2 dynamic filters:
+    // 1. In the HashJoinExec (producer)
+    // 2. In the DataSourceExec (consumer, pushed down to the probe side)
+    let count_after = count_dynamic_filters(&optimized_plan);
+    assert_eq!(
+        count_after, 2,
+        "After optimization, should discover exactly 2 dynamic filters (1 in HashJoinExec, 1 in DataSourceExec), found {count_after}"
+    );
+}
+
+// ==== Filter pushdown through SortExec tests ====
+
+/// FilterExec above a plain SortExec (no fetch) should be pushed below it.
+/// The scan supports pushdown, so the filter lands in the DataSourceExec.
+#[test]
+fn test_filter_pushdown_through_sort_into_scan() {
+    let scan = TestScanBuilder::new(schema()).with_support(true).build();
+    let sort = Arc::new(SortExec::new(
+        LexOrdering::new(vec![PhysicalSortExpr::new_default(
+            col("a", &schema()).unwrap(),
+        )])
+        .unwrap(),
+        scan,
+    ));
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate, sort).unwrap());
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo
+        -   SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
+    "
+    );
+}
+
+/// FilterExec above a plain SortExec (no fetch) when the scan does NOT
+/// support pushdown. The filter should still move below the sort, landing
+/// as a new FilterExec between SortExec and DataSourceExec.
+#[test]
+fn test_filter_pushdown_through_sort_no_scan_support() {
+    let scan = TestScanBuilder::new(schema()).with_support(false).build();
+    let sort = Arc::new(SortExec::new(
+        LexOrdering::new(vec![PhysicalSortExpr::new_default(
+            col("a", &schema()).unwrap(),
+        )])
+        .unwrap(),
+        scan,
+    ));
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate, sort).unwrap());
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), false),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo
+        -   SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+      output:
+        Ok:
+          - SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+          -   FilterExec: a@0 = foo
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+    "
+    );
+}
+
+/// Multiple conjunctive filters above a plain SortExec should all be
+/// pushed below the sort as a single FilterExec.
+#[test]
+fn test_multiple_filters_pushdown_through_sort() {
+    let scan = TestScanBuilder::new(schema()).with_support(false).build();
+    let sort = Arc::new(SortExec::new(
+        LexOrdering::new(vec![PhysicalSortExpr::new_default(
+            col("a", &schema()).unwrap(),
+        )])
+        .unwrap(),
+        scan,
+    ));
+    // WHERE a = 'foo' AND b = 'bar'
+    let predicate = Arc::new(BinaryExpr::new(
+        col_lit_predicate("a", "foo", &schema()),
+        Operator::And,
+        col_lit_predicate("b", "bar", &schema()),
+    )) as Arc<dyn PhysicalExpr>;
+    let plan = Arc::new(FilterExec::try_new(predicate, sort).unwrap());
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), false),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo AND b@1 = bar
+        -   SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+      output:
+        Ok:
+          - SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+          -   FilterExec: a@0 = foo AND b@1 = bar
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+    "
+    );
+}
+
+/// FilterExec above a SortExec with fetch (TopK) must NOT be pushed below,
+/// because limiting happens after filtering — changing the order would alter
+/// the result set.
+#[test]
+fn test_filter_not_pushed_through_sort_with_fetch() {
+    let scan = TestScanBuilder::new(schema()).with_support(false).build();
+    let sort = Arc::new(
+        SortExec::new(
+            LexOrdering::new(vec![PhysicalSortExpr::new_default(
+                col("a", &schema()).unwrap(),
+            )])
+            .unwrap(),
+            scan,
+        )
+        .with_fetch(Some(10)),
+    );
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate, sort).unwrap());
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), false),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo
+        -   SortExec: TopK(fetch=10), expr=[a@0 ASC], preserve_partitioning=[false]
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+      output:
+        Ok:
+          - FilterExec: a@0 = foo
+          -   SortExec: TopK(fetch=10), expr=[a@0 ASC], preserve_partitioning=[false]
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+    "
+    );
+}
+
+/// Same as above, but with a scan that supports pushdown: the filter must
+/// still stay above the SortExec with fetch (TopK), so the plan is unchanged.
+#[test]
+fn test_filter_pushed_through_sort_with_fetch() {
+    let scan = TestScanBuilder::new(schema()).with_support(true).build();
+    let sort = Arc::new(
+        SortExec::new(
+            LexOrdering::new(vec![PhysicalSortExpr::new_default(
+                col("a", &schema()).unwrap(),
+            )])
+            .unwrap(),
+            scan,
+        )
+        .with_fetch(Some(10)),
+    );
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate, sort).unwrap());
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), false),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo
+        -   SortExec: TopK(fetch=10), expr=[a@0 ASC], preserve_partitioning=[false]
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - FilterExec: a@0 = foo
+          -   SortExec: TopK(fetch=10), expr=[a@0 ASC], preserve_partitioning=[false]
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+    "
+    );
+}
+
+/// FilterExec with a projection above SortExec. The filter should be pushed
+/// below the sort, and the projection should be preserved as a
+/// ProjectionExec on top.
+#[test]
+fn test_filter_with_projection_pushdown_through_sort() {
+    let scan = TestScanBuilder::new(schema()).with_support(false).build();
+    let sort = Arc::new(SortExec::new(
+        LexOrdering::new(vec![PhysicalSortExpr::new_default(
+            col("a", &schema()).unwrap(),
+        )])
+        .unwrap(),
+        scan,
+    ));
+    // FilterExec: b = 'bar', projection=[a] (only output column a)
+    let predicate = col_lit_predicate("b", "bar", &schema());
+    let plan = Arc::new(
+        FilterExecBuilder::new(predicate, sort)
+            .apply_projection(Some(vec![0]))
+            .unwrap()
+            .build()
+            .unwrap(),
+    );
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), false),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: b@1 = bar, projection=[a@0]
+        -   SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+      output:
+        Ok:
+          - ProjectionExec: expr=[a@0 as a]
+          -   SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+          -     FilterExec: b@1 = bar
+          -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+    "
+    );
+}
+
+/// SortExec with preserve_partitioning=true should keep that setting after
+/// filters are pushed below it.
+#[test]
+fn test_filter_pushdown_through_sort_preserves_partitioning() {
+    let scan = TestScanBuilder::new(schema()).with_support(false).build();
+    let sort = Arc::new(
+        SortExec::new(
+            LexOrdering::new(vec![PhysicalSortExpr::new_default(
+                col("a", &schema()).unwrap(),
+            )])
+            .unwrap(),
+            scan,
+        )
+        .with_preserve_partitioning(true),
+    );
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate, sort).unwrap());
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), false),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo
+        -   SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+      output:
+        Ok:
+          - SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+          -   FilterExec: a@0 = foo
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+    "
+    );
+}
+
+/// FilterExec **with a fetch limit** above a plain SortExec. When the filter
+/// is pushed below the sort the fetch should be propagated to the SortExec
+/// (turning it into a TopK).
+#[test]
+fn test_filter_with_fetch_pushdown_through_sort() {
+    let scan = TestScanBuilder::new(schema()).with_support(false).build();
+    let sort = Arc::new(SortExec::new(
+        LexOrdering::new(vec![PhysicalSortExpr::new_default(
+            col("a", &schema()).unwrap(),
+        )])
+        .unwrap(),
+        scan,
+    ));
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(
+        FilterExecBuilder::new(predicate, sort)
+            .with_fetch(Some(10))
+            .build()
+            .unwrap(),
+    );
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), false),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo, fetch=10
+        -   SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+      output:
+        Ok:
+          - SortExec: TopK(fetch=10), expr=[a@0 ASC], preserve_partitioning=[false]
+          -   FilterExec: a@0 = foo
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+    "
+    );
+}
+
+#[test]
+fn test_filter_pushdown_through_sort_with_projection() {
+    let scan = TestScanBuilder::new(schema()).with_support(false).build();
+    let sort = Arc::new(SortExec::new(
+        LexOrdering::new(vec![PhysicalSortExpr::new(
+            col("a", &schema()).unwrap(),
+            SortOptions::new(true, false), // descending, nulls_last
+        )])
+        .unwrap(),
+        scan,
+    ));
+    // FilterExec: b = 'bar', projection=[a] (only output column a)
+    let predicate = col_lit_predicate("b", "bar", &schema());
+    let plan = Arc::new(
+        FilterExecBuilder::new(predicate, sort)
+            .apply_projection(Some(vec![0]))
+            .unwrap()
+            .build()
+            .unwrap(),
+    );
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), false),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: b@1 = bar, projection=[a@0]
+        -   SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+      output:
+        Ok:
+          - ProjectionExec: expr=[a@0 as a]
+          -   SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+          -     FilterExec: b@1 = bar
+          -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+    "
+    );
+}
diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs
deleted file mode 100644
index de61149508904..0000000000000
--- a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs
+++ /dev/null
@@ -1,2335 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use std::sync::{Arc, LazyLock};
-
-use arrow::{
-    array::record_batch,
-    datatypes::{DataType, Field, Schema, SchemaRef},
-    util::pretty::pretty_format_batches,
-};
-use arrow_schema::SortOptions;
-use datafusion::{
-    assert_batches_eq,
-    logical_expr::Operator,
-    physical_plan::{
-        expressions::{BinaryExpr, Column, Literal},
-        PhysicalExpr,
-    },
-    prelude::{ParquetReadOptions, SessionConfig, SessionContext},
-    scalar::ScalarValue,
-};
-use datafusion_catalog::memory::DataSourceExec;
-use datafusion_common::config::ConfigOptions;
-use datafusion_datasource::{
-    file_groups::FileGroup, file_scan_config::FileScanConfigBuilder, PartitionedFile,
-};
-use datafusion_execution::object_store::ObjectStoreUrl;
-use datafusion_expr::ScalarUDF;
-use datafusion_functions::math::random::RandomFunc;
-use datafusion_functions_aggregate::count::count_udaf;
-use datafusion_physical_expr::{
-    aggregate::AggregateExprBuilder, Partitioning, ScalarFunctionExpr,
-};
-use datafusion_physical_expr::{expressions::col, LexOrdering, PhysicalSortExpr};
-use datafusion_physical_optimizer::{
-    filter_pushdown::FilterPushdown, PhysicalOptimizerRule,
-};
-use datafusion_physical_plan::{
-    aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy},
-    coalesce_batches::CoalesceBatchesExec,
-    coalesce_partitions::CoalescePartitionsExec,
-    collect,
-    filter::FilterExec,
-    repartition::RepartitionExec,
-    sorts::sort::SortExec,
-    ExecutionPlan,
-};
-
-use datafusion_physical_plan::union::UnionExec;
-use futures::StreamExt;
-use object_store::{memory::InMemory, ObjectStore};
-use util::{format_plan_for_test, OptimizationTest, TestNode, TestScanBuilder};
-
-use crate::physical_optimizer::filter_pushdown::util::TestSource;
-
-mod util;
-
-#[test]
-fn test_pushdown_into_scan() {
-    let scan = TestScanBuilder::new(schema()).with_support(true).build();
-    let predicate = col_lit_predicate("a", "foo", &schema());
-    let plan = Arc::new(FilterExec::try_new(predicate, scan).unwrap());
-
-    // expect the predicate to be pushed down into the DataSource
-    insta::assert_snapshot!(
-        OptimizationTest::new(plan, FilterPushdown::new(), true),
-        @r"
-    OptimizationTest:
-      input:
-        - FilterExec: a@0 = foo
-        -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
-      output:
-        Ok:
-          - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo - " - ); -} - -#[test] -fn test_pushdown_volatile_functions_not_allowed() { - // Test that we do not push down filters with volatile functions - // Use random() as an example of a volatile function - let scan = TestScanBuilder::new(schema()).with_support(true).build(); - let cfg = Arc::new(ConfigOptions::default()); - let predicate = Arc::new(BinaryExpr::new( - Arc::new(Column::new_with_schema("a", &schema()).unwrap()), - Operator::Eq, - Arc::new( - ScalarFunctionExpr::try_new( - Arc::new(ScalarUDF::from(RandomFunc::new())), - vec![], - &schema(), - cfg, - ) - .unwrap(), - ), - )) as Arc; - let plan = Arc::new(FilterExec::try_new(predicate, scan).unwrap()); - // expect the filter to not be pushed down - insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown::new(), true), - @r" - OptimizationTest: - input: - - FilterExec: a@0 = random() - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - output: - Ok: - - FilterExec: a@0 = random() - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - ", - ); -} - -/// Show that we can use config options to determine how to do pushdown. -#[test] -fn test_pushdown_into_scan_with_config_options() { - let scan = TestScanBuilder::new(schema()).with_support(true).build(); - let predicate = col_lit_predicate("a", "foo", &schema()); - let plan = Arc::new(FilterExec::try_new(predicate, scan).unwrap()) as _; - - let mut cfg = ConfigOptions::default(); - insta::assert_snapshot!( - OptimizationTest::new( - Arc::clone(&plan), - FilterPushdown::new(), - false - ), - @r" - OptimizationTest: - input: - - FilterExec: a@0 = foo - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - output: - Ok: - - FilterExec: a@0 = foo - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - " - ); - - cfg.execution.parquet.pushdown_filters = true; - insta::assert_snapshot!( - OptimizationTest::new( - plan, - FilterPushdown::new(), - true - ), - @r" - OptimizationTest: - input: - - FilterExec: a@0 = foo - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - output: - Ok: - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo - " - ); -} - -#[tokio::test] -async fn test_dynamic_filter_pushdown_through_hash_join_with_topk() { - use datafusion_common::JoinType; - use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode}; - - // Create build side with limited values - let build_batches = vec![record_batch!( - ("a", Utf8, ["aa", "ab"]), - ("b", Utf8View, ["ba", "bb"]), - ("c", Float64, [1.0, 2.0]) - ) - .unwrap()]; - let build_side_schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Utf8, false), - Field::new("b", DataType::Utf8View, false), - Field::new("c", DataType::Float64, false), - ])); - let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema)) - .with_support(true) - .with_batches(build_batches) - .build(); - - // Create probe side with more values - let probe_batches = vec![record_batch!( - ("d", Utf8, ["aa", "ab", "ac", "ad"]), - ("e", Utf8View, ["ba", "bb", "bc", "bd"]), - ("f", Float64, [1.0, 2.0, 3.0, 4.0]) - ) - 
.unwrap()]; - let probe_side_schema = Arc::new(Schema::new(vec![ - Field::new("d", DataType::Utf8, false), - Field::new("e", DataType::Utf8View, false), - Field::new("f", DataType::Float64, false), - ])); - let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema)) - .with_support(true) - .with_batches(probe_batches) - .build(); - - // Create HashJoinExec - let on = vec![( - col("a", &build_side_schema).unwrap(), - col("d", &probe_side_schema).unwrap(), - )]; - let join = Arc::new( - HashJoinExec::try_new( - build_scan, - probe_scan, - on, - None, - &JoinType::Inner, - None, - PartitionMode::Partitioned, - datafusion_common::NullEquality::NullEqualsNothing, - ) - .unwrap(), - ); - - let join_schema = join.schema(); - - // Finally let's add a SortExec on the outside to test pushdown of dynamic filters - let sort_expr = - PhysicalSortExpr::new(col("e", &join_schema).unwrap(), SortOptions::default()); - let plan = Arc::new( - SortExec::new(LexOrdering::new(vec![sort_expr]).unwrap(), join) - .with_fetch(Some(2)), - ) as Arc; - - let mut config = ConfigOptions::default(); - config.optimizer.enable_dynamic_filter_pushdown = true; - config.execution.parquet.pushdown_filters = true; - - // Apply the FilterPushdown optimizer rule - let plan = FilterPushdown::new_post_optimization() - .optimize(Arc::clone(&plan), &config) - .unwrap(); - - // Test that filters are pushed down correctly to each side of the join - insta::assert_snapshot!( - format_plan_for_test(&plan), - @r" - - SortExec: TopK(fetch=2), expr=[e@4 ASC], preserve_partitioning=[false] - - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, d@0)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] AND DynamicFilter [ empty ] - " - ); - - // Put some data through the plan to check that the filter is updated to reflect the TopK state - let session_ctx = SessionContext::new_with_config(SessionConfig::new()); - session_ctx.register_object_store( - ObjectStoreUrl::parse("test://").unwrap().as_ref(), - Arc::new(InMemory::new()), - ); - let state = session_ctx.state(); - let task_ctx = state.task_ctx(); - let mut stream = plan.execute(0, Arc::clone(&task_ctx)).unwrap(); - // Iterate one batch - stream.next().await.unwrap().unwrap(); - - // Test that filters are pushed down correctly to each side of the join - insta::assert_snapshot!( - format_plan_for_test(&plan), - @r" - - SortExec: TopK(fetch=2), expr=[e@4 ASC], preserve_partitioning=[false], filter=[e@4 IS NULL OR e@4 < bb] - - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, d@0)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ d@0 >= aa AND d@0 <= ab ] AND DynamicFilter [ e@1 IS NULL OR e@1 < bb ] - " - ); -} - -// Test both static and dynamic filter pushdown in HashJoinExec. -// Note that static filter pushdown is rare: it should have already happened in the logical optimizer phase. -// However users may manually construct plans that could result in a FilterExec -> HashJoinExec -> Scan setup. -// Dynamic filters arise in cases such as nested inner joins or TopK -> HashJoinExec -> Scan setups. 
-#[tokio::test] -async fn test_static_filter_pushdown_through_hash_join() { - use datafusion_common::JoinType; - use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode}; - - // Create build side with limited values - let build_batches = vec![record_batch!( - ("a", Utf8, ["aa", "ab"]), - ("b", Utf8View, ["ba", "bb"]), - ("c", Float64, [1.0, 2.0]) - ) - .unwrap()]; - let build_side_schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Utf8, false), - Field::new("b", DataType::Utf8View, false), - Field::new("c", DataType::Float64, false), - ])); - let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema)) - .with_support(true) - .with_batches(build_batches) - .build(); - - // Create probe side with more values - let probe_batches = vec![record_batch!( - ("d", Utf8, ["aa", "ab", "ac", "ad"]), - ("e", Utf8View, ["ba", "bb", "bc", "bd"]), - ("f", Float64, [1.0, 2.0, 3.0, 4.0]) - ) - .unwrap()]; - let probe_side_schema = Arc::new(Schema::new(vec![ - Field::new("d", DataType::Utf8, false), - Field::new("e", DataType::Utf8View, false), - Field::new("f", DataType::Float64, false), - ])); - let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema)) - .with_support(true) - .with_batches(probe_batches) - .build(); - - // Create HashJoinExec - let on = vec![( - col("a", &build_side_schema).unwrap(), - col("d", &probe_side_schema).unwrap(), - )]; - let join = Arc::new( - HashJoinExec::try_new( - build_scan, - probe_scan, - on, - None, - &JoinType::Inner, - None, - PartitionMode::Partitioned, - datafusion_common::NullEquality::NullEqualsNothing, - ) - .unwrap(), - ); - - // Create filters that can be pushed down to different sides - // We need to create filters in the context of the join output schema - let join_schema = join.schema(); - - // Filter on build side column: a = 'aa' - let left_filter = col_lit_predicate("a", "aa", &join_schema); - // Filter on probe side column: e = 'ba' - let right_filter = col_lit_predicate("e", "ba", &join_schema); - // Filter that references both sides: a = d (should not be pushed down) - let cross_filter = Arc::new(BinaryExpr::new( - col("a", &join_schema).unwrap(), - Operator::Eq, - col("d", &join_schema).unwrap(), - )) as Arc<dyn PhysicalExpr>; - - let filter = - Arc::new(FilterExec::try_new(left_filter, Arc::clone(&join) as _).unwrap()); - let filter = Arc::new(FilterExec::try_new(right_filter, filter).unwrap()); - let plan = Arc::new(FilterExec::try_new(cross_filter, filter).unwrap()) - as Arc<dyn ExecutionPlan>; - - // Test that filters are pushed down correctly to each side of the join - insta::assert_snapshot!( - OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new(), true), - @r" - OptimizationTest: - input: - - FilterExec: a@0 = d@3 - - FilterExec: e@4 = ba - - FilterExec: a@0 = aa - - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, d@0)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true - output: - Ok: - - FilterExec: a@0 = d@3 - - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, d@0)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = aa - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true, predicate=e@1 = ba - " - ); - - // Test left join - filters should NOT be pushed down - 
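// (Note the contrast with the Inner-join case above: as the snapshots below
// show, the rule leaves the FilterExec above the Left join instead of pushing
// it into either input. Pushing a filter past an outer join can change which
// rows get null-extended, so the rule stays conservative for non-Inner joins.)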
let join = Arc::new( - HashJoinExec::try_new( - TestScanBuilder::new(Arc::clone(&build_side_schema)) - .with_support(true) - .build(), - TestScanBuilder::new(Arc::clone(&probe_side_schema)) - .with_support(true) - .build(), - vec![( - col("a", &build_side_schema).unwrap(), - col("d", &probe_side_schema).unwrap(), - )], - None, - &JoinType::Left, - None, - PartitionMode::Partitioned, - datafusion_common::NullEquality::NullEqualsNothing, - ) - .unwrap(), - ); - - let join_schema = join.schema(); - let filter = col_lit_predicate("a", "aa", &join_schema); - let plan = - Arc::new(FilterExec::try_new(filter, join).unwrap()) as Arc<dyn ExecutionPlan>; - - // Test that filters are NOT pushed down for left join - insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown::new(), true), - @r" - OptimizationTest: - input: - - FilterExec: a@0 = aa - - HashJoinExec: mode=Partitioned, join_type=Left, on=[(a@0, d@0)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true - output: - Ok: - - FilterExec: a@0 = aa - - HashJoinExec: mode=Partitioned, join_type=Left, on=[(a@0, d@0)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true - " - ); -} - -#[test] -fn test_filter_collapse() { - // filter should be pushed down into the parquet scan with two filters - let scan = TestScanBuilder::new(schema()).with_support(true).build(); - let predicate1 = col_lit_predicate("a", "foo", &schema()); - let filter1 = Arc::new(FilterExec::try_new(predicate1, scan).unwrap()); - let predicate2 = col_lit_predicate("b", "bar", &schema()); - let plan = Arc::new(FilterExec::try_new(predicate2, filter1).unwrap()); - - insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown::new(), true), - @r" - OptimizationTest: - input: - - FilterExec: b@1 = bar - - FilterExec: a@0 = foo - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - output: - Ok: - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo AND b@1 = bar - " - ); -} - -#[test] -fn test_filter_with_projection() { - let scan = TestScanBuilder::new(schema()).with_support(true).build(); - let projection = vec![1, 0]; - let predicate = col_lit_predicate("a", "foo", &schema()); - let plan = Arc::new( - FilterExec::try_new(predicate, Arc::clone(&scan)) - .unwrap() - .with_projection(Some(projection)) - .unwrap(), - ); - - // expect the predicate to be pushed down into the DataSource but the FilterExec to be converted to ProjectionExec - insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown::new(), true), - @r" - OptimizationTest: - input: - - FilterExec: a@0 = foo, projection=[b@1, a@0] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - output: - Ok: - - ProjectionExec: expr=[b@1 as b, a@0 as a] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo - ", - ); - - // add a test where the filter is on a column that isn't included in the output - let projection = vec![1]; 
- let predicate = col_lit_predicate("a", "foo", &schema()); - let plan = Arc::new( - FilterExec::try_new(predicate, scan) - .unwrap() - .with_projection(Some(projection)) - .unwrap(), - ); - insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown::new(), true), - @r" - OptimizationTest: - input: - - FilterExec: a@0 = foo, projection=[b@1] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - output: - Ok: - - ProjectionExec: expr=[b@1 as b] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo - " - ); -} - -#[test] -fn test_push_down_through_transparent_nodes() { - // expect the predicate to be pushed down into the DataSource - let scan = TestScanBuilder::new(schema()).with_support(true).build(); - let coalesce = Arc::new(CoalesceBatchesExec::new(scan, 1)); - let predicate = col_lit_predicate("a", "foo", &schema()); - let filter = Arc::new(FilterExec::try_new(predicate, coalesce).unwrap()); - let repartition = Arc::new( - RepartitionExec::try_new(filter, Partitioning::RoundRobinBatch(1)).unwrap(), - ); - let predicate = col_lit_predicate("b", "bar", &schema()); - let plan = Arc::new(FilterExec::try_new(predicate, repartition).unwrap()); - - // expect the predicate to be pushed down into the DataSource - insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown::new(), true), - @r" - OptimizationTest: - input: - - FilterExec: b@1 = bar - - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=1 - - FilterExec: a@0 = foo - - CoalesceBatchesExec: target_batch_size=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - output: - Ok: - - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=1 - - CoalesceBatchesExec: target_batch_size=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo AND b@1 = bar - " - ); -} - -#[test] -fn test_pushdown_through_aggregates_on_grouping_columns() { - // Test that filters on grouping columns can be pushed through AggregateExec. - // This test has two filters: - // 1. An inner filter (a@0 = foo) below the aggregate - gets pushed to DataSource - // 2. 
An outer filter (b@1 = bar) above the aggregate - also gets pushed through because 'b' is a grouping column - let scan = TestScanBuilder::new(schema()).with_support(true).build(); - - let coalesce = Arc::new(CoalesceBatchesExec::new(scan, 10)); - - let filter = Arc::new( - FilterExec::try_new(col_lit_predicate("a", "foo", &schema()), coalesce).unwrap(), - ); - - let aggregate_expr = - vec![ - AggregateExprBuilder::new(count_udaf(), vec![col("a", &schema()).unwrap()]) - .schema(schema()) - .alias("cnt") - .build() - .map(Arc::new) - .unwrap(), - ]; - let group_by = PhysicalGroupBy::new_single(vec![ - (col("a", &schema()).unwrap(), "a".to_string()), - (col("b", &schema()).unwrap(), "b".to_string()), - ]); - let aggregate = Arc::new( - AggregateExec::try_new( - AggregateMode::Final, - group_by, - aggregate_expr.clone(), - vec![None], - filter, - schema(), - ) - .unwrap(), - ); - - let coalesce = Arc::new(CoalesceBatchesExec::new(aggregate, 100)); - - let predicate = col_lit_predicate("b", "bar", &schema()); - let plan = Arc::new(FilterExec::try_new(predicate, coalesce).unwrap()); - - // Both filters should be pushed down to the DataSource since both reference grouping columns - insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown::new(), true), - @r" - OptimizationTest: - input: - - FilterExec: b@1 = bar - - CoalesceBatchesExec: target_batch_size=100 - - AggregateExec: mode=Final, gby=[a@0 as a, b@1 as b], aggr=[cnt], ordering_mode=PartiallySorted([0]) - - FilterExec: a@0 = foo - - CoalesceBatchesExec: target_batch_size=10 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - output: - Ok: - - CoalesceBatchesExec: target_batch_size=100 - - AggregateExec: mode=Final, gby=[a@0 as a, b@1 as b], aggr=[cnt], ordering_mode=Sorted - - CoalesceBatchesExec: target_batch_size=10 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo AND b@1 = bar - " - ); -} - -/// Test various combinations of handling of child pushdown results -/// in an ExecutionPlan in combination with support/not support in a DataSource. -#[test] -fn test_node_handles_child_pushdown_result() { - // If we set `with_support(true)` + `inject_filter = true` then the filter is pushed down to the DataSource - // and no FilterExec is created. - let scan = TestScanBuilder::new(schema()).with_support(true).build(); - let predicate = col_lit_predicate("a", "foo", &schema()); - let plan = Arc::new(TestNode::new(true, Arc::clone(&scan), predicate)); - insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown::new(), true), - @r" - OptimizationTest: - input: - - TestInsertExec { inject_filter: true } - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - output: - Ok: - - TestInsertExec { inject_filter: true } - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo - ", - ); - - // If we set `with_support(false)` + `inject_filter = true` then the filter is not pushed down to the DataSource - // and a FilterExec is created. 
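// For quick reference, the full matrix this test walks through:
//
//   with_support | inject_filter | outcome
//   -------------+---------------+--------------------------------------------
//   true         | true          | predicate absorbed by the DataSourceExec
//   false        | true          | TestInsertExec adds a FilterExec instead
//   false        | false         | no pushdown and no FilterExec is created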
- let scan = TestScanBuilder::new(schema()).with_support(false).build(); - let predicate = col_lit_predicate("a", "foo", &schema()); - let plan = Arc::new(TestNode::new(true, Arc::clone(&scan), predicate)); - insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown::new(), true), - @r" - OptimizationTest: - input: - - TestInsertExec { inject_filter: true } - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false - output: - Ok: - - TestInsertExec { inject_filter: false } - - FilterExec: a@0 = foo - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false - ", - ); - - // If we set `with_support(false)` + `inject_filter = false` then the filter is not pushed down to the DataSource - // and no FilterExec is created. - let scan = TestScanBuilder::new(schema()).with_support(false).build(); - let predicate = col_lit_predicate("a", "foo", &schema()); - let plan = Arc::new(TestNode::new(false, Arc::clone(&scan), predicate)); - insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown::new(), true), - @r" - OptimizationTest: - input: - - TestInsertExec { inject_filter: false } - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false - output: - Ok: - - TestInsertExec { inject_filter: false } - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false - ", - ); -} - -#[tokio::test] -async fn test_topk_dynamic_filter_pushdown() { - let batches = vec![ - record_batch!( - ("a", Utf8, ["aa", "ab"]), - ("b", Utf8, ["bd", "bc"]), - ("c", Float64, [1.0, 2.0]) - ) - .unwrap(), - record_batch!( - ("a", Utf8, ["ac", "ad"]), - ("b", Utf8, ["bb", "ba"]), - ("c", Float64, [2.0, 1.0]) - ) - .unwrap(), - ]; - let scan = TestScanBuilder::new(schema()) - .with_support(true) - .with_batches(batches) - .build(); - let plan = Arc::new( - SortExec::new( - LexOrdering::new(vec![PhysicalSortExpr::new( - col("b", &schema()).unwrap(), - SortOptions::new(true, false), // descending, nulls_first - )]) - .unwrap(), - Arc::clone(&scan), - ) - .with_fetch(Some(1)), - ) as Arc<dyn ExecutionPlan>; - - // expect the predicate to be pushed down into the DataSource - insta::assert_snapshot!( - OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new_post_optimization(), true), - @r" - OptimizationTest: - input: - - SortExec: TopK(fetch=1), expr=[b@1 DESC NULLS LAST], preserve_partitioning=[false] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - output: - Ok: - - SortExec: TopK(fetch=1), expr=[b@1 DESC NULLS LAST], preserve_partitioning=[false] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] - " - ); - - // Actually apply the optimization to the plan and put some data through it to check that the filter is updated to reflect the TopK state - let mut config = ConfigOptions::default(); - config.execution.parquet.pushdown_filters = true; - let plan = FilterPushdown::new_post_optimization() - .optimize(plan, &config) - .unwrap(); - let config = SessionConfig::new().with_batch_size(2); - let session_ctx = SessionContext::new_with_config(config); - session_ctx.register_object_store( - ObjectStoreUrl::parse("test://").unwrap().as_ref(), - Arc::new(InMemory::new()), - ); - let state = 
session_ctx.state(); - let task_ctx = state.task_ctx(); - let mut stream = plan.execute(0, Arc::clone(&task_ctx)).unwrap(); - // Iterate one batch - stream.next().await.unwrap().unwrap(); - // Now check what our filter looks like - insta::assert_snapshot!( - format!("{}", format_plan_for_test(&plan)), - @r" - - SortExec: TopK(fetch=1), expr=[b@1 DESC NULLS LAST], preserve_partitioning=[false], filter=[b@1 > bd] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ b@1 > bd ] - " - ); -} - -#[tokio::test] -async fn test_topk_dynamic_filter_pushdown_multi_column_sort() { - let batches = vec![ - // We are going to do ORDER BY b ASC NULLS LAST, a DESC - // And we put the values in such a way that the first batch will fill the TopK - // and we skip the second batch. - record_batch!( - ("a", Utf8, ["ac", "ad"]), - ("b", Utf8, ["bb", "ba"]), - ("c", Float64, [2.0, 1.0]) - ) - .unwrap(), - record_batch!( - ("a", Utf8, ["aa", "ab"]), - ("b", Utf8, ["bc", "bd"]), - ("c", Float64, [1.0, 2.0]) - ) - .unwrap(), - ]; - let scan = TestScanBuilder::new(schema()) - .with_support(true) - .with_batches(batches) - .build(); - let plan = Arc::new( - SortExec::new( - LexOrdering::new(vec![ - PhysicalSortExpr::new( - col("b", &schema()).unwrap(), - SortOptions::default().asc().nulls_last(), - ), - PhysicalSortExpr::new( - col("a", &schema()).unwrap(), - SortOptions::default().desc().nulls_first(), - ), - ]) - .unwrap(), - Arc::clone(&scan), - ) - .with_fetch(Some(2)), - ) as Arc<dyn ExecutionPlan>; - - // expect the predicate to be pushed down into the DataSource - insta::assert_snapshot!( - OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new_post_optimization(), true), - @r" - OptimizationTest: - input: - - SortExec: TopK(fetch=2), expr=[b@1 ASC NULLS LAST, a@0 DESC], preserve_partitioning=[false] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - output: - Ok: - - SortExec: TopK(fetch=2), expr=[b@1 ASC NULLS LAST, a@0 DESC], preserve_partitioning=[false] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] - " - ); - - // Actually apply the optimization to the plan and put some data through it to check that the filter is updated to reflect the TopK state - let mut config = ConfigOptions::default(); - config.execution.parquet.pushdown_filters = true; - let plan = FilterPushdown::new_post_optimization() - .optimize(plan, &config) - .unwrap(); - let config = SessionConfig::new().with_batch_size(2); - let session_ctx = SessionContext::new_with_config(config); - session_ctx.register_object_store( - ObjectStoreUrl::parse("test://").unwrap().as_ref(), - Arc::new(InMemory::new()), - ); - let state = session_ctx.state(); - let task_ctx = state.task_ctx(); - let mut stream = plan.execute(0, Arc::clone(&task_ctx)).unwrap(); - // Iterate one batch - let res = stream.next().await.unwrap().unwrap(); - #[rustfmt::skip] - let expected = [ - "+----+----+-----+", - "| a | b | c |", - "+----+----+-----+", - "| ad | ba | 1.0 |", - "| ac | bb | 2.0 |", - "+----+----+-----+", - ]; - assert_batches_eq!(expected, &[res]); - // Now check what our filter looks like - insta::assert_snapshot!( - format!("{}", format_plan_for_test(&plan)), - @r" - - SortExec: TopK(fetch=2), expr=[b@1 ASC NULLS LAST, a@0 DESC], preserve_partitioning=[false], filter=[b@1 < bb OR b@1 = bb AND (a@0 
IS NULL OR a@0 > ac)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ b@1 < bb OR b@1 = bb AND (a@0 IS NULL OR a@0 > ac) ] - " - ); - // There should be no more batches - assert!(stream.next().await.is_none()); -} - -#[tokio::test] -async fn test_topk_filter_passes_through_coalesce_partitions() { - // Create multiple batches for different partitions - let batches = vec![ - record_batch!( - ("a", Utf8, ["aa", "ab"]), - ("b", Utf8, ["bd", "bc"]), - ("c", Float64, [1.0, 2.0]) - ) - .unwrap(), - record_batch!( - ("a", Utf8, ["ac", "ad"]), - ("b", Utf8, ["bb", "ba"]), - ("c", Float64, [2.0, 1.0]) - ) - .unwrap(), - ]; - - // Create a source that supports all batches - let source = Arc::new(TestSource::new(true, batches)); - - let base_config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test://").unwrap(), - Arc::clone(&schema()), - source, - ) - .with_file_groups(vec![ - // Partition 0 - FileGroup::new(vec![PartitionedFile::new("test1.parquet", 123)]), - // Partition 1 - FileGroup::new(vec![PartitionedFile::new("test2.parquet", 123)]), - ]) - .build(); - - let scan = DataSourceExec::from_data_source(base_config); - - // Add CoalescePartitionsExec to merge the two partitions - let coalesce = Arc::new(CoalescePartitionsExec::new(scan)) as Arc<dyn ExecutionPlan>; - - // Add SortExec with TopK - let plan = Arc::new( - SortExec::new( - LexOrdering::new(vec![PhysicalSortExpr::new( - col("b", &schema()).unwrap(), - SortOptions::new(true, false), - )]) - .unwrap(), - coalesce, - ) - .with_fetch(Some(1)), - ) as Arc<dyn ExecutionPlan>; - - // Test optimization - the filter SHOULD pass through CoalescePartitionsExec - // if it properly implements from_children (not all_unsupported) - insta::assert_snapshot!( - OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new_post_optimization(), true), - @r" - OptimizationTest: - input: - - SortExec: TopK(fetch=1), expr=[b@1 DESC NULLS LAST], preserve_partitioning=[false] - - CoalescePartitionsExec - - DataSourceExec: file_groups={2 groups: [[test1.parquet], [test2.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - output: - Ok: - - SortExec: TopK(fetch=1), expr=[b@1 DESC NULLS LAST], preserve_partitioning=[false] - - CoalescePartitionsExec - - DataSourceExec: file_groups={2 groups: [[test1.parquet], [test2.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] - " - ); -} - -#[tokio::test] -async fn test_topk_filter_passes_through_coalesce_batches() { - let batches = vec![ - record_batch!( - ("a", Utf8, ["aa", "ab"]), - ("b", Utf8, ["bd", "bc"]), - ("c", Float64, [1.0, 2.0]) - ) - .unwrap(), - record_batch!( - ("a", Utf8, ["ac", "ad"]), - ("b", Utf8, ["bb", "ba"]), - ("c", Float64, [2.0, 1.0]) - ) - .unwrap(), - ]; - - let scan = TestScanBuilder::new(schema()) - .with_support(true) - .with_batches(batches) - .build(); - - let coalesce_batches = - Arc::new(CoalesceBatchesExec::new(scan, 1024)) as Arc<dyn ExecutionPlan>; - - // Add SortExec with TopK - let plan = Arc::new( - SortExec::new( - LexOrdering::new(vec![PhysicalSortExpr::new( - col("b", &schema()).unwrap(), - SortOptions::new(true, false), - )]) - .unwrap(), - coalesce_batches, - ) - .with_fetch(Some(1)), - ) as Arc<dyn ExecutionPlan>; - - insta::assert_snapshot!( - OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new_post_optimization(), true), - @r" - OptimizationTest: - input: - - SortExec: TopK(fetch=1), expr=[b@1 DESC NULLS LAST], preserve_partitioning=[false] - - 
CoalesceBatchesExec: target_batch_size=1024 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - output: - Ok: - - SortExec: TopK(fetch=1), expr=[b@1 DESC NULLS LAST], preserve_partitioning=[false] - - CoalesceBatchesExec: target_batch_size=1024 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] - " - ); -} - -#[tokio::test] -async fn test_hashjoin_dynamic_filter_pushdown() { - use datafusion_common::JoinType; - use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode}; - - // Create build side with limited values - let build_batches = vec![record_batch!( - ("a", Utf8, ["aa", "ab"]), - ("b", Utf8, ["ba", "bb"]), - ("c", Float64, [1.0, 2.0]) // Extra column not used in join - ) - .unwrap()]; - let build_side_schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Utf8, false), - Field::new("b", DataType::Utf8, false), - Field::new("c", DataType::Float64, false), - ])); - let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema)) - .with_support(true) - .with_batches(build_batches) - .build(); - - // Create probe side with more values - let probe_batches = vec![record_batch!( - ("a", Utf8, ["aa", "ab", "ac", "ad"]), - ("b", Utf8, ["ba", "bb", "bc", "bd"]), - ("e", Float64, [1.0, 2.0, 3.0, 4.0]) // Extra column not used in join - ) - .unwrap()]; - let probe_side_schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Utf8, false), - Field::new("b", DataType::Utf8, false), - Field::new("e", DataType::Float64, false), - ])); - let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema)) - .with_support(true) - .with_batches(probe_batches) - .build(); - - // Create HashJoinExec with dynamic filter - let on = vec![ - ( - col("a", &build_side_schema).unwrap(), - col("a", &probe_side_schema).unwrap(), - ), - ( - col("b", &build_side_schema).unwrap(), - col("b", &probe_side_schema).unwrap(), - ), - ]; - let plan = Arc::new( - HashJoinExec::try_new( - build_scan, - probe_scan, - on, - None, - &JoinType::Inner, - None, - PartitionMode::CollectLeft, - datafusion_common::NullEquality::NullEqualsNothing, - ) - .unwrap(), - ) as Arc<dyn ExecutionPlan>; - - // expect the predicate to be pushed down into the probe side DataSource - insta::assert_snapshot!( - OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new_post_optimization(), true), - @r" - OptimizationTest: - input: - - HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true - output: - Ok: - - HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] - ", - ); - - // Actually apply the optimization to the plan and execute to see the filter in action - let mut config = ConfigOptions::default(); - config.execution.parquet.pushdown_filters = true; - config.optimizer.enable_dynamic_filter_pushdown = true; - let plan = FilterPushdown::new_post_optimization() - .optimize(plan, &config) - 
.unwrap(); - - // Test for https://github.com/apache/datafusion/pull/17371: dynamic filter linking survives `with_new_children` - let children = plan.children().into_iter().map(Arc::clone).collect(); - let plan = plan.with_new_children(children).unwrap(); - - let config = SessionConfig::new().with_batch_size(10); - let session_ctx = SessionContext::new_with_config(config); - session_ctx.register_object_store( - ObjectStoreUrl::parse("test://").unwrap().as_ref(), - Arc::new(InMemory::new()), - ); - let state = session_ctx.state(); - let task_ctx = state.task_ctx(); - let mut stream = plan.execute(0, Arc::clone(&task_ctx)).unwrap(); - // Iterate one batch - stream.next().await.unwrap().unwrap(); - - // Now check what our filter looks like - insta::assert_snapshot!( - format!("{}", format_plan_for_test(&plan)), - @r" - - HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb ] - " - ); -} - -#[tokio::test] -async fn test_hashjoin_dynamic_filter_pushdown_partitioned() { - use datafusion_common::JoinType; - use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode}; - - // Rough sketch of the MRE we're trying to recreate: - // COPY (select i as k from generate_series(1, 10000000) as t(i)) - // TO 'test_files/scratch/push_down_filter/t1.parquet' - // STORED AS PARQUET; - // COPY (select i as k, i as v from generate_series(1, 10000000) as t(i)) - // TO 'test_files/scratch/push_down_filter/t2.parquet' - // STORED AS PARQUET; - // create external table t1 stored as parquet location 'test_files/scratch/push_down_filter/t1.parquet'; - // create external table t2 stored as parquet location 'test_files/scratch/push_down_filter/t2.parquet'; - // explain - // select * - // from t1 - // join t2 on t1.k = t2.k; - // +---------------+------------------------------------------------------------+ - // | plan_type | plan | - // +---------------+------------------------------------------------------------+ - // | physical_plan | ┌───────────────────────────┐ | - // | | │ CoalesceBatchesExec │ | - // | | │ -------------------- │ | - // | | │ target_batch_size: │ | - // | | │ 8192 │ | - // | | └─────────────┬─────────────┘ | - // | | ┌─────────────┴─────────────┐ | - // | | │ HashJoinExec │ | - // | | │ -------------------- ├──────────────┐ | - // | | │ on: (k = k) │ │ | - // | | └─────────────┬─────────────┘ │ | - // | | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ | - // | | │ CoalesceBatchesExec ││ CoalesceBatchesExec │ | - // | | │ -------------------- ││ -------------------- │ | - // | | │ target_batch_size: ││ target_batch_size: │ | - // | | │ 8192 ││ 8192 │ | - // | | └─────────────┬─────────────┘└─────────────┬─────────────┘ | - // | | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ | - // | | │ RepartitionExec ││ RepartitionExec │ | - // | | │ -------------------- ││ -------------------- │ | - // | | │ partition_count(in->out): ││ partition_count(in->out): │ | - // | | │ 12 -> 12 ││ 12 -> 12 │ | - // | | │ ││ │ | - // | | │ partitioning_scheme: ││ partitioning_scheme: │ | - // | | │ Hash([k@0], 12) ││ Hash([k@0], 12) │ | - // | | └─────────────┬─────────────┘└─────────────┬─────────────┘ | - // | | 
┌─────────────┴─────────────┐┌─────────────┴─────────────┐ | - // | | │ DataSourceExec ││ DataSourceExec │ | - // | | │ -------------------- ││ -------------------- │ | - // | | │ files: 12 ││ files: 12 │ | - // | | │ format: parquet ││ format: parquet │ | - // | | │ ││ predicate: true │ | - // | | └───────────────────────────┘└───────────────────────────┘ | - // | | | - // +---------------+------------------------------------------------------------+ - - // Create build side with limited values - let build_batches = vec![record_batch!( - ("a", Utf8, ["aa", "ab"]), - ("b", Utf8, ["ba", "bb"]), - ("c", Float64, [1.0, 2.0]) // Extra column not used in join - ) - .unwrap()]; - let build_side_schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Utf8, false), - Field::new("b", DataType::Utf8, false), - Field::new("c", DataType::Float64, false), - ])); - let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema)) - .with_support(true) - .with_batches(build_batches) - .build(); - - // Create probe side with more values - let probe_batches = vec![record_batch!( - ("a", Utf8, ["aa", "ab", "ac", "ad"]), - ("b", Utf8, ["ba", "bb", "bc", "bd"]), - ("e", Float64, [1.0, 2.0, 3.0, 4.0]) // Extra column not used in join - ) - .unwrap()]; - let probe_side_schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Utf8, false), - Field::new("b", DataType::Utf8, false), - Field::new("e", DataType::Float64, false), - ])); - let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema)) - .with_support(true) - .with_batches(probe_batches) - .build(); - - // Create RepartitionExec nodes for both sides with hash partitioning on join keys - let partition_count = 12; - - // Build side: DataSource -> RepartitionExec (Hash) -> CoalesceBatchesExec - let build_hash_exprs = vec![ - col("a", &build_side_schema).unwrap(), - col("b", &build_side_schema).unwrap(), - ]; - let build_repartition = Arc::new( - RepartitionExec::try_new( - build_scan, - Partitioning::Hash(build_hash_exprs, partition_count), - ) - .unwrap(), - ); - let build_coalesce = Arc::new(CoalesceBatchesExec::new(build_repartition, 8192)); - - // Probe side: DataSource -> RepartitionExec (Hash) -> CoalesceBatchesExec - let probe_hash_exprs = vec![ - col("a", &probe_side_schema).unwrap(), - col("b", &probe_side_schema).unwrap(), - ]; - let probe_repartition = Arc::new( - RepartitionExec::try_new( - Arc::clone(&probe_scan), - Partitioning::Hash(probe_hash_exprs, partition_count), - ) - .unwrap(), - ); - let probe_coalesce = Arc::new(CoalesceBatchesExec::new(probe_repartition, 8192)); - - // Create HashJoinExec with partitioned inputs - let on = vec![ - ( - col("a", &build_side_schema).unwrap(), - col("a", &probe_side_schema).unwrap(), - ), - ( - col("b", &build_side_schema).unwrap(), - col("b", &probe_side_schema).unwrap(), - ), - ]; - let hash_join = Arc::new( - HashJoinExec::try_new( - build_coalesce, - probe_coalesce, - on, - None, - &JoinType::Inner, - None, - PartitionMode::Partitioned, - datafusion_common::NullEquality::NullEqualsNothing, - ) - .unwrap(), - ); - - // Top-level CoalesceBatchesExec - let cb = - Arc::new(CoalesceBatchesExec::new(hash_join, 8192)) as Arc<dyn ExecutionPlan>; - // Top-level CoalescePartitionsExec - let cp = Arc::new(CoalescePartitionsExec::new(cb)) as Arc<dyn ExecutionPlan>; - // Add a sort for deterministic output - let plan = Arc::new(SortExec::new( - LexOrdering::new(vec![PhysicalSortExpr::new( - col("a", &probe_side_schema).unwrap(), - SortOptions::new(true, false), // descending, nulls_first - )]) - .unwrap(), - cp, - )) as 
Arc<dyn ExecutionPlan>; - - // expect the predicate to be pushed down into the probe side DataSource - insta::assert_snapshot!( - OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new_post_optimization(), true), - @r" - OptimizationTest: - input: - - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] - - CoalescePartitionsExec - - CoalesceBatchesExec: target_batch_size=8192 - - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] - - CoalesceBatchesExec: target_batch_size=8192 - - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - - CoalesceBatchesExec: target_batch_size=8192 - - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true - output: - Ok: - - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] - - CoalescePartitionsExec - - CoalesceBatchesExec: target_batch_size=8192 - - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] - - CoalesceBatchesExec: target_batch_size=8192 - - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - - CoalesceBatchesExec: target_batch_size=8192 - - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] - " - ); - - // Actually apply the optimization to the plan and execute to see the filter in action - let mut config = ConfigOptions::default(); - config.execution.parquet.pushdown_filters = true; - config.optimizer.enable_dynamic_filter_pushdown = true; - let plan = FilterPushdown::new_post_optimization() - .optimize(plan, &config) - .unwrap(); - let config = SessionConfig::new().with_batch_size(10); - let session_ctx = SessionContext::new_with_config(config); - session_ctx.register_object_store( - ObjectStoreUrl::parse("test://").unwrap().as_ref(), - Arc::new(InMemory::new()), - ); - let state = session_ctx.state(); - let task_ctx = state.task_ctx(); - let batches = collect(Arc::clone(&plan), Arc::clone(&task_ctx)) - .await - .unwrap(); - - // Now check what our filter looks like - #[cfg(not(feature = "force_hash_collisions"))] - insta::assert_snapshot!( - format!("{}", format_plan_for_test(&plan)), - @r" - - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] - - CoalescePartitionsExec - - CoalesceBatchesExec: target_batch_size=8192 - - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] - - CoalesceBatchesExec: target_batch_size=8192 - - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - - CoalesceBatchesExec: target_batch_size=8192 - - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= ab AND a@0 <= ab AND b@1 >= bb AND b@1 <= bb OR a@0 >= aa AND a@0 <= aa AND b@1 >= ba AND b@1 <= ba ] - " - ); - - #[cfg(feature = 
"force_hash_collisions")] - insta::assert_snapshot!( - format!("{}", format_plan_for_test(&plan)), - @r" - - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] - - CoalescePartitionsExec - - CoalesceBatchesExec: target_batch_size=8192 - - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] - - CoalesceBatchesExec: target_batch_size=8192 - - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - - CoalesceBatchesExec: target_batch_size=8192 - - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb ] - " - ); - - let result = format!("{}", pretty_format_batches(&batches).unwrap()); - - let probe_scan_metrics = probe_scan.metrics().unwrap(); - - // The probe side had 4 rows, but after applying the dynamic filter only 2 rows should remain. - // The number of output rows from the probe side scan should stay consistent across executions. - // Issue: https://github.com/apache/datafusion/issues/17451 - assert_eq!(probe_scan_metrics.output_rows().unwrap(), 2); - - insta::assert_snapshot!( - result, - @r" - +----+----+-----+----+----+-----+ - | a | b | c | a | b | e | - +----+----+-----+----+----+-----+ - | ab | bb | 2.0 | ab | bb | 2.0 | - | aa | ba | 1.0 | aa | ba | 1.0 | - +----+----+-----+----+----+-----+ - ", - ); -} - -#[tokio::test] -async fn test_hashjoin_dynamic_filter_pushdown_collect_left() { - use datafusion_common::JoinType; - use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode}; - - let build_batches = vec![record_batch!( - ("a", Utf8, ["aa", "ab"]), - ("b", Utf8, ["ba", "bb"]), - ("c", Float64, [1.0, 2.0]) // Extra column not used in join - ) - .unwrap()]; - let build_side_schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Utf8, false), - Field::new("b", DataType::Utf8, false), - Field::new("c", DataType::Float64, false), - ])); - let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema)) - .with_support(true) - .with_batches(build_batches) - .build(); - - // Create probe side with more values - let probe_batches = vec![record_batch!( - ("a", Utf8, ["aa", "ab", "ac", "ad"]), - ("b", Utf8, ["ba", "bb", "bc", "bd"]), - ("e", Float64, [1.0, 2.0, 3.0, 4.0]) // Extra column not used in join - ) - .unwrap()]; - let probe_side_schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Utf8, false), - Field::new("b", DataType::Utf8, false), - Field::new("e", DataType::Float64, false), - ])); - let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema)) - .with_support(true) - .with_batches(probe_batches) - .build(); - - // Create RepartitionExec nodes for both sides with hash partitioning on join keys - let partition_count = 12; - - // Probe side: DataSource -> RepartitionExec(Hash) -> CoalesceBatchesExec - let probe_hash_exprs = vec![ - col("a", &probe_side_schema).unwrap(), - col("b", &probe_side_schema).unwrap(), - ]; - let probe_repartition = Arc::new( - RepartitionExec::try_new( - Arc::clone(&probe_scan), - Partitioning::Hash(probe_hash_exprs, partition_count), // create multi partitions on probSide - ) - .unwrap(), - ); - let probe_coalesce = Arc::new(CoalesceBatchesExec::new(probe_repartition, 8192)); - - let on = vec![ - ( - col("a", 
&build_side_schema).unwrap(), - col("a", &probe_side_schema).unwrap(), - ), - ( - col("b", &build_side_schema).unwrap(), - col("b", &probe_side_schema).unwrap(), - ), - ]; - let hash_join = Arc::new( - HashJoinExec::try_new( - build_scan, - probe_coalesce, - on, - None, - &JoinType::Inner, - None, - PartitionMode::CollectLeft, - datafusion_common::NullEquality::NullEqualsNothing, - ) - .unwrap(), - ); - - // Top-level CoalesceBatchesExec - let cb = - Arc::new(CoalesceBatchesExec::new(hash_join, 8192)) as Arc<dyn ExecutionPlan>; - // Top-level CoalescePartitionsExec - let cp = Arc::new(CoalescePartitionsExec::new(cb)) as Arc<dyn ExecutionPlan>; - // Add a sort for deterministic output - let plan = Arc::new(SortExec::new( - LexOrdering::new(vec![PhysicalSortExpr::new( - col("a", &probe_side_schema).unwrap(), - SortOptions::new(true, false), // descending, nulls_first - )]) - .unwrap(), - cp, - )) as Arc<dyn ExecutionPlan>; - - // expect the predicate to be pushed down into the probe side DataSource - insta::assert_snapshot!( - OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new_post_optimization(), true), - @r" - OptimizationTest: - input: - - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] - - CoalescePartitionsExec - - CoalesceBatchesExec: target_batch_size=8192 - - HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - - CoalesceBatchesExec: target_batch_size=8192 - - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true - output: - Ok: - - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] - - CoalescePartitionsExec - - CoalesceBatchesExec: target_batch_size=8192 - - HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - - CoalesceBatchesExec: target_batch_size=8192 - - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] - " - ); - - // Actually apply the optimization to the plan and execute to see the filter in action - let mut config = ConfigOptions::default(); - config.execution.parquet.pushdown_filters = true; - config.optimizer.enable_dynamic_filter_pushdown = true; - let plan = FilterPushdown::new_post_optimization() - .optimize(plan, &config) - .unwrap(); - let config = SessionConfig::new().with_batch_size(10); - let session_ctx = SessionContext::new_with_config(config); - session_ctx.register_object_store( - ObjectStoreUrl::parse("test://").unwrap().as_ref(), - Arc::new(InMemory::new()), - ); - let state = session_ctx.state(); - let task_ctx = state.task_ctx(); - let batches = collect(Arc::clone(&plan), Arc::clone(&task_ctx)) - .await - .unwrap(); - - // Now check what our filter looks like - insta::assert_snapshot!( - format!("{}", format_plan_for_test(&plan)), - @r" - - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] - - CoalescePartitionsExec - - CoalesceBatchesExec: target_batch_size=8192 - - HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, 
pushdown_supported=true - - CoalesceBatchesExec: target_batch_size=8192 - - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb ] - " - ); - - let result = format!("{}", pretty_format_batches(&batches).unwrap()); - - let probe_scan_metrics = probe_scan.metrics().unwrap(); - - // The probe side had 4 rows, but after applying the dynamic filter only 2 rows should remain. - // The number of output rows from the probe side scan should stay consistent across executions. - // Issue: https://github.com/apache/datafusion/issues/17451 - assert_eq!(probe_scan_metrics.output_rows().unwrap(), 2); - - insta::assert_snapshot!( - result, - @r" - +----+----+-----+----+----+-----+ - | a | b | c | a | b | e | - +----+----+-----+----+----+-----+ - | ab | bb | 2.0 | ab | bb | 2.0 | - | aa | ba | 1.0 | aa | ba | 1.0 | - +----+----+-----+----+----+-----+ - ", - ); -} - -#[tokio::test] -async fn test_nested_hashjoin_dynamic_filter_pushdown() { - use datafusion_common::JoinType; - use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode}; - - // Create test data for three tables: t1, t2, t3 - // t1: small table with limited values (will be build side of outer join) - let t1_batches = - vec![ - record_batch!(("a", Utf8, ["aa", "ab"]), ("x", Float64, [1.0, 2.0])).unwrap(), - ]; - let t1_schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Utf8, false), - Field::new("x", DataType::Float64, false), - ])); - let t1_scan = TestScanBuilder::new(Arc::clone(&t1_schema)) - .with_support(true) - .with_batches(t1_batches) - .build(); - - // t2: larger table (will be probe side of inner join, build side of outer join) - let t2_batches = vec![record_batch!( - ("b", Utf8, ["aa", "ab", "ac", "ad", "ae"]), - ("c", Utf8, ["ca", "cb", "cc", "cd", "ce"]), - ("y", Float64, [1.0, 2.0, 3.0, 4.0, 5.0]) - ) - .unwrap()]; - let t2_schema = Arc::new(Schema::new(vec![ - Field::new("b", DataType::Utf8, false), - Field::new("c", DataType::Utf8, false), - Field::new("y", DataType::Float64, false), - ])); - let t2_scan = TestScanBuilder::new(Arc::clone(&t2_schema)) - .with_support(true) - .with_batches(t2_batches) - .build(); - - // t3: largest table (will be probe side of inner join) - let t3_batches = vec![record_batch!( - ("d", Utf8, ["ca", "cb", "cc", "cd", "ce", "cf", "cg", "ch"]), - ("z", Float64, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]) - ) - .unwrap()]; - let t3_schema = Arc::new(Schema::new(vec![ - Field::new("d", DataType::Utf8, false), - Field::new("z", DataType::Float64, false), - ])); - let t3_scan = TestScanBuilder::new(Arc::clone(&t3_schema)) - .with_support(true) - .with_batches(t3_batches) - .build(); - - // Create nested join structure: - // Join (t1.a = t2.b) - // / \ - // t1 Join(t2.c = t3.d) - // / \ - // t2 t3 - - // First create inner join: t2.c = t3.d - let inner_join_on = - vec![(col("c", &t2_schema).unwrap(), col("d", &t3_schema).unwrap())]; - let inner_join = Arc::new( - HashJoinExec::try_new( - t2_scan, - t3_scan, - inner_join_on, - None, - &JoinType::Inner, - None, - PartitionMode::Partitioned, - datafusion_common::NullEquality::NullEqualsNothing, - ) - .unwrap(), - ); - - // Then create outer join: t1.a = t2.b (from inner join result) - let outer_join_on = vec![( - col("a", &t1_schema).unwrap(), - col("b", &inner_join.schema()).unwrap(), - )]; - let outer_join = Arc::new( - 
HashJoinExec::try_new( - t1_scan, - inner_join as Arc<dyn ExecutionPlan>, - outer_join_on, - None, - &JoinType::Inner, - None, - PartitionMode::Partitioned, - datafusion_common::NullEquality::NullEqualsNothing, - ) - .unwrap(), - ) as Arc<dyn ExecutionPlan>; - - // Test that dynamic filters are pushed down correctly through nested joins - insta::assert_snapshot!( - OptimizationTest::new(Arc::clone(&outer_join), FilterPushdown::new_post_optimization(), true), - @r" - OptimizationTest: - input: - - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@0)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, x], file_type=test, pushdown_supported=true - - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@1, d@0)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[b, c, y], file_type=test, pushdown_supported=true - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, z], file_type=test, pushdown_supported=true - output: - Ok: - - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@0)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, x], file_type=test, pushdown_supported=true - - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@1, d@0)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[b, c, y], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, z], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] - ", - ); - - // Execute the plan to verify the dynamic filters are properly updated - let mut config = ConfigOptions::default(); - config.execution.parquet.pushdown_filters = true; - config.optimizer.enable_dynamic_filter_pushdown = true; - let plan = FilterPushdown::new_post_optimization() - .optimize(outer_join, &config) - .unwrap(); - let config = SessionConfig::new().with_batch_size(10); - let session_ctx = SessionContext::new_with_config(config); - session_ctx.register_object_store( - ObjectStoreUrl::parse("test://").unwrap().as_ref(), - Arc::new(InMemory::new()), - ); - let state = session_ctx.state(); - let task_ctx = state.task_ctx(); - let mut stream = plan.execute(0, Arc::clone(&task_ctx)).unwrap(); - // Execute to populate the dynamic filters - stream.next().await.unwrap().unwrap(); - - // Verify that both the inner and outer join have updated dynamic filters - insta::assert_snapshot!( - format!("{}", format_plan_for_test(&plan)), - @r" - - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@0)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, x], file_type=test, pushdown_supported=true - - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@1, d@0)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[b, c, y], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ b@0 >= aa AND b@0 <= ab ] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, z], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ d@0 >= ca AND d@0 <= cb ] - " - ); -} - -#[tokio::test] -async fn test_hashjoin_parent_filter_pushdown() { - use datafusion_common::JoinType; - use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode}; - - // Create build side with limited values - let build_batches = vec![record_batch!( - ("a", Utf8, ["aa", "ab"]), - ("b", Utf8, ["ba", "bb"]), - ("c", Float64, [1.0, 2.0]) - ) - .unwrap()]; - let build_side_schema = Arc::new(Schema::new(vec![ - 
Field::new("a", DataType::Utf8, false), - Field::new("b", DataType::Utf8, false), - Field::new("c", DataType::Float64, false), - ])); - let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema)) - .with_support(true) - .with_batches(build_batches) - .build(); - - // Create probe side with more values - let probe_batches = vec![record_batch!( - ("d", Utf8, ["aa", "ab", "ac", "ad"]), - ("e", Utf8, ["ba", "bb", "bc", "bd"]), - ("f", Float64, [1.0, 2.0, 3.0, 4.0]) - ) - .unwrap()]; - let probe_side_schema = Arc::new(Schema::new(vec![ - Field::new("d", DataType::Utf8, false), - Field::new("e", DataType::Utf8, false), - Field::new("f", DataType::Float64, false), - ])); - let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema)) - .with_support(true) - .with_batches(probe_batches) - .build(); - - // Create HashJoinExec - let on = vec![( - col("a", &build_side_schema).unwrap(), - col("d", &probe_side_schema).unwrap(), - )]; - let join = Arc::new( - HashJoinExec::try_new( - build_scan, - probe_scan, - on, - None, - &JoinType::Inner, - None, - PartitionMode::Partitioned, - datafusion_common::NullEquality::NullEqualsNothing, - ) - .unwrap(), - ); - - // Create filters that can be pushed down to different sides - // We need to create filters in the context of the join output schema - let join_schema = join.schema(); - - // Filter on build side column: a = 'aa' - let left_filter = col_lit_predicate("a", "aa", &join_schema); - // Filter on probe side column: e = 'ba' - let right_filter = col_lit_predicate("e", "ba", &join_schema); - // Filter that references both sides: a = d (should not be pushed down) - let cross_filter = Arc::new(BinaryExpr::new( - col("a", &join_schema).unwrap(), - Operator::Eq, - col("d", &join_schema).unwrap(), - )) as Arc; - - let filter = - Arc::new(FilterExec::try_new(left_filter, Arc::clone(&join) as _).unwrap()); - let filter = Arc::new(FilterExec::try_new(right_filter, filter).unwrap()); - let plan = Arc::new(FilterExec::try_new(cross_filter, filter).unwrap()) - as Arc; - - // Test that filters are pushed down correctly to each side of the join - insta::assert_snapshot!( - OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new(), true), - @r" - OptimizationTest: - input: - - FilterExec: a@0 = d@3 - - FilterExec: e@4 = ba - - FilterExec: a@0 = aa - - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, d@0)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true - output: - Ok: - - FilterExec: a@0 = d@3 - - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, d@0)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = aa - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true, predicate=e@1 = ba - " - ); -} - -/// Integration test for dynamic filter pushdown with TopK. -/// We use an integration test because there are complex interactions in the optimizer rules -/// that the unit tests applying a single optimizer rule do not cover. 
-#[tokio::test] -async fn test_topk_dynamic_filter_pushdown_integration() { - let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>; - let mut cfg = SessionConfig::new(); - cfg.options_mut().execution.parquet.pushdown_filters = true; - cfg.options_mut().execution.parquet.max_row_group_size = 128; - let ctx = SessionContext::new_with_config(cfg); - ctx.register_object_store( - ObjectStoreUrl::parse("memory://").unwrap().as_ref(), - Arc::clone(&store), - ); - ctx.sql( - r" -COPY ( - SELECT 1372708800 + value AS t - FROM generate_series(0, 99999) - ORDER BY t - ) TO 'memory:///1.parquet' -STORED AS PARQUET; - ", - ) - .await - .unwrap() - .collect() - .await - .unwrap(); - - // Register the file with the context - ctx.register_parquet( - "topk_pushdown", - "memory:///1.parquet", - ParquetReadOptions::default(), - ) - .await - .unwrap(); - - // Create a TopK query that will use dynamic filter pushdown - let df = ctx - .sql(r"EXPLAIN ANALYZE SELECT t FROM topk_pushdown ORDER BY t LIMIT 10;") - .await - .unwrap(); - let batches = df.collect().await.unwrap(); - let explain = format!("{}", pretty_format_batches(&batches).unwrap()); - - assert!(explain.contains("output_rows=128")); // Read 1 row group - assert!(explain.contains("t@0 < 1372708809")); // Dynamic filter was applied - assert!( - explain.contains("pushdown_rows_matched=128, pushdown_rows_pruned=99872"), - "{explain}" - ); - // Pushdown pruned most rows -} - -#[test] -fn test_filter_pushdown_through_union() { - let scan1 = TestScanBuilder::new(schema()).with_support(true).build(); - let scan2 = TestScanBuilder::new(schema()).with_support(true).build(); - - let union = UnionExec::try_new(vec![scan1, scan2]).unwrap(); - - let predicate = col_lit_predicate("a", "foo", &schema()); - let plan = Arc::new(FilterExec::try_new(predicate, union).unwrap()); - - insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown::new(), true), - @r" - OptimizationTest: - input: - - FilterExec: a@0 = foo - - UnionExec - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - output: - Ok: - - UnionExec - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo - " - ); -} - -/// Schema: -/// a: String -/// b: String -/// c: f64 -static TEST_SCHEMA: LazyLock<SchemaRef> = LazyLock::new(|| { - let fields = vec![ - Field::new("a", DataType::Utf8, false), - Field::new("b", DataType::Utf8, false), - Field::new("c", DataType::Float64, false), - ]; - Arc::new(Schema::new(fields)) -}); - -fn schema() -> SchemaRef { - Arc::clone(&TEST_SCHEMA) -} - -/// Returns a predicate that is a binary expression col = lit -fn col_lit_predicate( - column_name: &str, - scalar_value: impl Into<ScalarValue>, - schema: &Schema, -) -> Arc<dyn PhysicalExpr> { - let scalar_value = scalar_value.into(); - Arc::new(BinaryExpr::new( - Arc::new(Column::new_with_schema(column_name, schema).unwrap()), - Operator::Eq, - Arc::new(Literal::new(scalar_value)), - )) -} - -#[tokio::test] -async fn test_aggregate_filter_pushdown() { - // Test that filters can pass through AggregateExec even with aggregate functions - // when the filter references grouping columns - // Simulates: SELECT a, COUNT(b) FROM table WHERE a = 'x' GROUP BY 
-#[tokio::test]
-async fn test_aggregate_filter_pushdown() {
-    // Test that filters can pass through AggregateExec even with aggregate functions
-    // when the filter references grouping columns
-    // Simulates: SELECT a, COUNT(b) FROM table WHERE a = 'x' GROUP BY a
-
-    let batches =
-        vec![
-            record_batch!(("a", Utf8, ["x", "y"]), ("b", Utf8, ["foo", "bar"])).unwrap(),
-        ];
-
-    let scan = TestScanBuilder::new(schema())
-        .with_support(true)
-        .with_batches(batches)
-        .build();
-
-    // Create an aggregate: GROUP BY a with COUNT(b)
-    let group_by = PhysicalGroupBy::new_single(vec![(
-        col("a", &schema()).unwrap(),
-        "a".to_string(),
-    )]);
-
-    // Add COUNT aggregate
-    let count_expr =
-        AggregateExprBuilder::new(count_udaf(), vec![col("b", &schema()).unwrap()])
-            .schema(schema())
-            .alias("count")
-            .build()
-            .unwrap();
-
-    let aggregate = Arc::new(
-        AggregateExec::try_new(
-            AggregateMode::Partial,
-            group_by,
-            vec![count_expr.into()], // Has aggregate function
-            vec![None],              // No filter on the aggregate function
-            Arc::clone(&scan),
-            schema(),
-        )
-        .unwrap(),
-    );
-
-    // Add a filter on the grouping column 'a'
-    let predicate = col_lit_predicate("a", "x", &schema());
-    let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap())
-        as Arc<dyn ExecutionPlan>;
-
-    // Even with aggregate functions, filter on grouping column should be pushed through
-    insta::assert_snapshot!(
-        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new(), true),
-        @r"
-    OptimizationTest:
-      input:
-        - FilterExec: a@0 = x
-        - AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count]
-        - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
-      output:
-        Ok:
-          - AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count], ordering_mode=Sorted
-          - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = x
-    "
-    );
-}
-
-#[tokio::test]
-async fn test_no_pushdown_filter_on_aggregate_result() {
-    // Test that filters on aggregate results (not grouping columns) are NOT pushed through
-    // SELECT a, COUNT(b) as cnt FROM table GROUP BY a HAVING cnt > 5
-    // The filter on 'cnt' cannot be pushed down because it's an aggregate result
-
-    let batches =
-        vec![
-            record_batch!(("a", Utf8, ["x", "y"]), ("b", Utf8, ["foo", "bar"])).unwrap(),
-        ];
-
-    let scan = TestScanBuilder::new(schema())
-        .with_support(true)
-        .with_batches(batches)
-        .build();
-
-    // Create an aggregate: GROUP BY a with COUNT(b)
-    let group_by = PhysicalGroupBy::new_single(vec![(
-        col("a", &schema()).unwrap(),
-        "a".to_string(),
-    )]);
-
-    // Add COUNT aggregate
-    let count_expr =
-        AggregateExprBuilder::new(count_udaf(), vec![col("b", &schema()).unwrap()])
-            .schema(schema())
-            .alias("count")
-            .build()
-            .unwrap();
-
-    let aggregate = Arc::new(
-        AggregateExec::try_new(
-            AggregateMode::Partial,
-            group_by,
-            vec![count_expr.into()],
-            vec![None],
-            Arc::clone(&scan),
-            schema(),
-        )
-        .unwrap(),
-    );
-
-    // Add a filter on the aggregate output column
-    // This simulates filtering on COUNT result, which should NOT be pushed through
-    let agg_schema = aggregate.schema();
-    let predicate = Arc::new(BinaryExpr::new(
-        Arc::new(Column::new_with_schema("count[count]", &agg_schema).unwrap()),
-        Operator::Gt,
-        Arc::new(Literal::new(ScalarValue::Int64(Some(5)))),
-    ));
-    let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap())
-        as Arc<dyn ExecutionPlan>;
-
-    // The filter should NOT be pushed through the aggregate since it's on an aggregate result
-    insta::assert_snapshot!(
-        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new(), true),
-        @r"
-    OptimizationTest:
-      input:
-        - FilterExec: count[count]@1 > 5
-        - AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count]
-        - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
-      output:
-        Ok:
-          - FilterExec: count[count]@1 > 5
-          - AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count]
-          - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
-    "
-    );
-}
-
-#[test]
-fn test_pushdown_filter_on_non_first_grouping_column() {
-    // Test that filters on non-first grouping columns are still pushed down
-    // SELECT a, b, count(*) as cnt FROM table GROUP BY a, b HAVING b = 'bar'
-    // The filter is on 'b' (second grouping column), should push down
-    let scan = TestScanBuilder::new(schema()).with_support(true).build();
-
-    let aggregate_expr =
-        vec![
-            AggregateExprBuilder::new(count_udaf(), vec![col("c", &schema()).unwrap()])
-                .schema(schema())
-                .alias("cnt")
-                .build()
-                .map(Arc::new)
-                .unwrap(),
-        ];
-
-    let group_by = PhysicalGroupBy::new_single(vec![
-        (col("a", &schema()).unwrap(), "a".to_string()),
-        (col("b", &schema()).unwrap(), "b".to_string()),
-    ]);
-
-    let aggregate = Arc::new(
-        AggregateExec::try_new(
-            AggregateMode::Final,
-            group_by,
-            aggregate_expr.clone(),
-            vec![None],
-            scan,
-            schema(),
-        )
-        .unwrap(),
-    );
-
-    let predicate = col_lit_predicate("b", "bar", &schema());
-    let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap());
-
-    insta::assert_snapshot!(
-        OptimizationTest::new(plan, FilterPushdown::new(), true),
-        @r"
-    OptimizationTest:
-      input:
-        - FilterExec: b@1 = bar
-        - AggregateExec: mode=Final, gby=[a@0 as a, b@1 as b], aggr=[cnt]
-        - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
-      output:
-        Ok:
-          - AggregateExec: mode=Final, gby=[a@0 as a, b@1 as b], aggr=[cnt], ordering_mode=PartiallySorted([1])
-          - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=b@1 = bar
-    "
-    );
-}
-
-#[test]
-fn test_no_pushdown_grouping_sets_filter_on_missing_column() {
-    // Test that filters on columns missing from some grouping sets are NOT pushed through
-    let scan = TestScanBuilder::new(schema()).with_support(true).build();
-
-    let aggregate_expr =
-        vec![
-            AggregateExprBuilder::new(count_udaf(), vec![col("c", &schema()).unwrap()])
-                .schema(schema())
-                .alias("cnt")
-                .build()
-                .map(Arc::new)
-                .unwrap(),
-        ];
-
-    // Create GROUPING SETS with (a, b) and (b)
-    let group_by = PhysicalGroupBy::new(
-        vec![
-            (col("a", &schema()).unwrap(), "a".to_string()),
-            (col("b", &schema()).unwrap(), "b".to_string()),
-        ],
-        vec![
-            (
-                Arc::new(Literal::new(ScalarValue::Utf8(None))),
-                "a".to_string(),
-            ),
-            (
-                Arc::new(Literal::new(ScalarValue::Utf8(None))),
-                "b".to_string(),
-            ),
-        ],
-        vec![
-            vec![false, false], // (a, b) - both present
-            vec![true, false],  // (b) - a is NULL, b present
-        ],
-    );
-
-    let aggregate = Arc::new(
-        AggregateExec::try_new(
-            AggregateMode::Final,
-            group_by,
-            aggregate_expr.clone(),
-            vec![None],
-            scan,
-            schema(),
-        )
-        .unwrap(),
-    );
-
-    // Filter on column 'a' which is missing in the second grouping set, should not be pushed down
-    let predicate = col_lit_predicate("a", "foo", &schema());
-    let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap());
-
-    insta::assert_snapshot!(
-        OptimizationTest::new(plan, FilterPushdown::new(), true),
-        @r"
-    OptimizationTest:
-      input:
-        - FilterExec: a@0 = foo
-        - AggregateExec: mode=Final, gby=[(a@0 as a, b@1 as b), (NULL as a, b@1 as b)], aggr=[cnt]
-        - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
-      output:
-        Ok:
-          - FilterExec: a@0 = foo
-          - AggregateExec: mode=Final, gby=[(a@0 as a, b@1 as b), (NULL as a, b@1 as b)], aggr=[cnt]
-          - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
-    "
-    );
-}
-
-#[test]
-fn test_pushdown_grouping_sets_filter_on_common_column() {
-    // Test that filters on columns present in ALL grouping sets ARE pushed through
-    let scan = TestScanBuilder::new(schema()).with_support(true).build();
-
-    let aggregate_expr =
-        vec![
-            AggregateExprBuilder::new(count_udaf(), vec![col("c", &schema()).unwrap()])
-                .schema(schema())
-                .alias("cnt")
-                .build()
-                .map(Arc::new)
-                .unwrap(),
-        ];
-
-    // Create GROUPING SETS with (a, b) and (b)
-    let group_by = PhysicalGroupBy::new(
-        vec![
-            (col("a", &schema()).unwrap(), "a".to_string()),
-            (col("b", &schema()).unwrap(), "b".to_string()),
-        ],
-        vec![
-            (
-                Arc::new(Literal::new(ScalarValue::Utf8(None))),
-                "a".to_string(),
-            ),
-            (
-                Arc::new(Literal::new(ScalarValue::Utf8(None))),
-                "b".to_string(),
-            ),
-        ],
-        vec![
-            vec![false, false], // (a, b) - both present
-            vec![true, false],  // (b) - a is NULL, b present
-        ],
-    );
-
-    let aggregate = Arc::new(
-        AggregateExec::try_new(
-            AggregateMode::Final,
-            group_by,
-            aggregate_expr.clone(),
-            vec![None],
-            scan,
-            schema(),
-        )
-        .unwrap(),
-    );
-
-    // Filter on column 'b' which is present in all grouping sets will be pushed down
-    let predicate = col_lit_predicate("b", "bar", &schema());
-    let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap());
-
-    insta::assert_snapshot!(
-        OptimizationTest::new(plan, FilterPushdown::new(), true),
-        @r"
-    OptimizationTest:
-      input:
-        - FilterExec: b@1 = bar
-        - AggregateExec: mode=Final, gby=[(a@0 as a, b@1 as b), (NULL as a, b@1 as b)], aggr=[cnt]
-        - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
-      output:
-        Ok:
-          - AggregateExec: mode=Final, gby=[(a@0 as a, b@1 as b), (NULL as a, b@1 as b)], aggr=[cnt], ordering_mode=PartiallySorted([1])
-          - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=b@1 = bar
-    "
-    );
-}
-
-#[test]
-fn test_pushdown_with_empty_group_by() {
-    // Test that filters can be pushed down when GROUP BY is empty (no grouping columns)
-    // SELECT count(*) as cnt FROM table WHERE a = 'foo'
-    // There are no grouping columns, so the filter should still push down
-    let scan = TestScanBuilder::new(schema()).with_support(true).build();
-
-    let aggregate_expr =
-        vec![
-            AggregateExprBuilder::new(count_udaf(), vec![col("c", &schema()).unwrap()])
-                .schema(schema())
-                .alias("cnt")
-                .build()
-                .map(Arc::new)
-                .unwrap(),
-        ];
-
-    // Empty GROUP BY - no grouping columns
-    let group_by = PhysicalGroupBy::new_single(vec![]);
-
-    let aggregate = Arc::new(
-        AggregateExec::try_new(
-            AggregateMode::Final,
-            group_by,
-            aggregate_expr.clone(),
-            vec![None],
-            scan,
-            schema(),
-        )
-        .unwrap(),
-    );
-
-    // Filter on 'a'
-    let predicate = col_lit_predicate("a", "foo", &schema());
-    let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap());
-
-    // The filter should be pushed down even with empty GROUP BY
-    insta::assert_snapshot!(
-        OptimizationTest::new(plan, FilterPushdown::new(), true),
-        @r"
-    OptimizationTest:
-      input:
-        - FilterExec: a@0 = foo
-        - AggregateExec: mode=Final, gby=[], aggr=[cnt]
-        - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
-      output:
-        Ok:
-          - AggregateExec: mode=Final, gby=[], aggr=[cnt]
-          - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
-    "
-    );
-}
-
-#[test]
-fn test_pushdown_with_computed_grouping_key() {
-    // Test filter pushdown with computed grouping expression
-    // SELECT (c + 1.0) as c_plus_1, count(*) FROM table WHERE c > 5.0 GROUP BY (c + 1.0)
-
-    let scan = TestScanBuilder::new(schema()).with_support(true).build();
-
-    let predicate = Arc::new(BinaryExpr::new(
-        col("c", &schema()).unwrap(),
-        Operator::Gt,
-        Arc::new(Literal::new(ScalarValue::Float64(Some(5.0)))),
-    )) as Arc<dyn PhysicalExpr>;
-    let filter = Arc::new(FilterExec::try_new(predicate, scan).unwrap());
-
-    let aggregate_expr =
-        vec![
-            AggregateExprBuilder::new(count_udaf(), vec![col("a", &schema()).unwrap()])
-                .schema(schema())
-                .alias("cnt")
-                .build()
-                .map(Arc::new)
-                .unwrap(),
-        ];
-
-    let c_plus_one = Arc::new(BinaryExpr::new(
-        col("c", &schema()).unwrap(),
-        Operator::Plus,
-        Arc::new(Literal::new(ScalarValue::Float64(Some(1.0)))),
-    )) as Arc<dyn PhysicalExpr>;
-
-    let group_by =
-        PhysicalGroupBy::new_single(vec![(c_plus_one, "c_plus_1".to_string())]);
-
-    let plan = Arc::new(
-        AggregateExec::try_new(
-            AggregateMode::Final,
-            group_by,
-            aggregate_expr.clone(),
-            vec![None],
-            filter,
-            schema(),
-        )
-        .unwrap(),
-    );
-
-    // The filter should be pushed down because 'c' is extracted from the grouping expression (c + 1.0)
-    insta::assert_snapshot!(
-        OptimizationTest::new(plan, FilterPushdown::new(), true),
-        @r"
-    OptimizationTest:
-      input:
-        - AggregateExec: mode=Final, gby=[c@2 + 1 as c_plus_1], aggr=[cnt]
-        - FilterExec: c@2 > 5
-        - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
-      output:
-        Ok:
-          - AggregateExec: mode=Final, gby=[c@2 + 1 as c_plus_1], aggr=[cnt]
-          - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=c@2 > 5
-    "
-    );
-}
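Editorial aside: all of the removed tests above drive `FilterPushdown` through the `OptimizationTest` snapshot harness. The rule is an ordinary `PhysicalOptimizerRule`, so it can also be applied directly, the same way the join-selection and limit-pushdown tests later in this diff invoke their rules. A sketch, assuming a `plan: Arc<dyn ExecutionPlan>` built as in these tests:

// Apply the optimizer rule directly; OptimizationTest does essentially
// this and then snapshots the plan before and after.
let optimized = FilterPushdown::new()
    .optimize(plan, &ConfigOptions::new())
    .unwrap();

diff --git a/datafusion/core/tests/physical_optimizer/join_selection.rs b/datafusion/core/tests/physical_optimizer/join_selection.rs
index f9d3a045469e1..050baa9e792e9 100644
--- a/datafusion/core/tests/physical_optimizer/join_selection.rs
+++ b/datafusion/core/tests/physical_optimizer/join_selection.rs
@@ -18,7 +18,6 @@
 use insta::assert_snapshot;
 use std::sync::Arc;
 use std::{
-    any::Any,
     pin::Pin,
     task::{Context, Poll},
 };
@@ -26,27 +25,28 @@
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use arrow::record_batch::RecordBatch;
 use datafusion_common::config::ConfigOptions;
-use datafusion_common::{stats::Precision, ColumnStatistics, JoinType, ScalarValue};
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{ColumnStatistics, JoinType, ScalarValue, stats::Precision};
 use datafusion_common::{JoinSide, NullEquality};
 use datafusion_common::{Result, Statistics};
 use datafusion_execution::{RecordBatchStream, SendableRecordBatchStream, TaskContext};
 use datafusion_expr::Operator;
+use datafusion_physical_expr::PhysicalExprRef;
 use datafusion_physical_expr::expressions::col;
 use datafusion_physical_expr::expressions::{BinaryExpr, Column, NegativeExpr};
 use datafusion_physical_expr::intervals::utils::check_support;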
-use datafusion_physical_expr::PhysicalExprRef;
 use datafusion_physical_expr::{EquivalenceProperties, Partitioning, PhysicalExpr};
-use datafusion_physical_optimizer::join_selection::JoinSelection;
 use datafusion_physical_optimizer::PhysicalOptimizerRule;
+use datafusion_physical_optimizer::join_selection::JoinSelection;
+use datafusion_physical_plan::ExecutionPlanProperties;
 use datafusion_physical_plan::displayable;
 use datafusion_physical_plan::joins::utils::ColumnIndex;
 use datafusion_physical_plan::joins::utils::JoinFilter;
 use datafusion_physical_plan::joins::{HashJoinExec, NestedLoopJoinExec, PartitionMode};
 use datafusion_physical_plan::projection::ProjectionExec;
-use datafusion_physical_plan::ExecutionPlanProperties;
 use datafusion_physical_plan::{
-    execution_plan::{Boundedness, EmissionType},
     DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties,
+    execution_plan::{Boundedness, EmissionType},
 };
 use futures::Stream;
@@ -222,6 +222,7 @@ async fn test_join_with_swap() {
             None,
             PartitionMode::CollectLeft,
             NullEquality::NullEqualsNothing,
+            false,
         )
         .unwrap(),
     );
@@ -231,7 +232,6 @@ async fn test_join_with_swap() {
         .unwrap();
 
     let swapping_projection = optimized_join
-        .as_any()
        .downcast_ref::<ProjectionExec>()
         .expect("A proj is required to swap columns back to their original order");
@@ -245,7 +245,6 @@ async fn test_join_with_swap() {
     let swapped_join = swapping_projection
         .input()
-        .as_any()
        .downcast_ref::<HashJoinExec>()
         .expect("The type of the plan should not be changed");
@@ -284,6 +283,7 @@ async fn test_left_join_no_swap() {
             None,
             PartitionMode::CollectLeft,
             NullEquality::NullEqualsNothing,
+            false,
         )
         .unwrap(),
     );
@@ -293,7 +293,6 @@ async fn test_left_join_no_swap() {
         .unwrap();
 
     let swapped_join = optimized_join
-        .as_any()
        .downcast_ref::<HashJoinExec>()
         .expect("The type of the plan should not be changed");
@@ -333,6 +332,7 @@ async fn test_join_with_swap_semi() {
         None,
         PartitionMode::Partitioned,
         NullEquality::NullEqualsNothing,
+        false,
     )
     .unwrap();
@@ -342,12 +342,9 @@ async fn test_join_with_swap_semi() {
         .optimize(Arc::new(join), &ConfigOptions::new())
         .unwrap();
 
-    let swapped_join = optimized_join
-        .as_any()
-        .downcast_ref::<HashJoinExec>()
-        .expect(
-            "A proj is not required to swap columns back to their original order",
-        );
+    let swapped_join = optimized_join.downcast_ref::<HashJoinExec>().expect(
+        "A proj is not required to swap columns back to their original order",
+    );
 
     assert_eq!(swapped_join.schema().fields().len(), 1);
     assert_eq!(
@@ -388,6 +385,7 @@ async fn test_join_with_swap_mark() {
         None,
         PartitionMode::Partitioned,
         NullEquality::NullEqualsNothing,
+        false,
     )
     .unwrap();
@@ -397,12 +395,9 @@ async fn test_join_with_swap_mark() {
         .optimize(Arc::new(join), &ConfigOptions::new())
         .unwrap();
 
-    let swapped_join = optimized_join
-        .as_any()
-        .downcast_ref::<HashJoinExec>()
-        .expect(
-            "A proj is not required to swap columns back to their original order",
-        );
+    let swapped_join = optimized_join.downcast_ref::<HashJoinExec>().expect(
+        "A proj is not required to swap columns back to their original order",
+    );
 
     assert_eq!(swapped_join.schema().fields().len(), 2);
     assert_eq!(
@@ -461,6 +456,7 @@ async fn test_nested_join_swap() {
         None,
         PartitionMode::CollectLeft,
         NullEquality::NullEqualsNothing,
+        false,
     )
     .unwrap();
     let child_schema = child_join.schema();
@@ -478,6 +474,7 @@ async fn test_nested_join_swap() {
         None,
         PartitionMode::CollectLeft,
         NullEquality::NullEqualsNothing,
+        false,
     )
     .unwrap();
@@ -518,6 +515,7 @@ async fn test_join_no_swap() {
             None,
             PartitionMode::CollectLeft,
             NullEquality::NullEqualsNothing,
+            false,
         )
         .unwrap(),
     );
@@ -527,7 +525,6 @@ async fn test_join_no_swap() {
         .unwrap();
 
     let swapped_join = optimized_join
-        .as_any()
        .downcast_ref::<HashJoinExec>()
         .expect("The type of the plan should not be changed");
@@ -576,7 +573,6 @@ async fn test_nl_join_with_swap(join_type: JoinType) {
         .unwrap();
 
     let swapping_projection = optimized_join
-        .as_any()
        .downcast_ref::<ProjectionExec>()
         .expect("A proj is required to swap columns back to their original order");
@@ -590,7 +586,6 @@ async fn test_nl_join_with_swap(join_type: JoinType) {
     let swapped_join = swapping_projection
         .input()
-        .as_any()
        .downcast_ref::<NestedLoopJoinExec>()
         .expect("The type of the plan should not be changed");
@@ -657,7 +652,6 @@ async fn test_nl_join_with_swap_no_proj(join_type: JoinType) {
         .unwrap();
 
     let swapped_join = optimized_join
-        .as_any()
        .downcast_ref::<NestedLoopJoinExec>()
         .expect("The type of the plan should not be changed");
@@ -745,16 +739,19 @@ async fn test_hash_join_swap_on_joins_with_projections(
         Some(projection),
         PartitionMode::Partitioned,
         NullEquality::NullEqualsNothing,
+        false,
     )?);
 
     let swapped = join
         .swap_inputs(PartitionMode::Partitioned)
         .expect("swap_hash_join must support joins with projections");
-    let swapped_join = swapped.as_any().downcast_ref::<HashJoinExec>().expect(
+    let swapped_join = swapped
+        .downcast_ref::<HashJoinExec>()
+        .expect(
         "ProjectionExec won't be added above if HashJoinExec contains embedded projection",
     );
 
-    assert_eq!(swapped_join.projection, Some(vec![0_usize]));
+    assert_eq!(swapped_join.projection.as_deref().unwrap(), &[0_usize]);
     assert_eq!(swapped.schema().fields.len(), 1);
     assert_eq!(swapped.schema().fields[0].name(), "small_col");
     Ok(())
@@ -762,7 +759,6 @@ async fn test_hash_join_swap_on_joins_with_projections(
 
 fn assert_col_expr(expr: &Arc<dyn PhysicalExpr>, name: &str, index: usize) {
     let col = expr
-        .as_any()
        .downcast_ref::<Column>()
         .expect("Projection items should be Column expression");
     assert_eq!(col.name(), name);
@@ -906,6 +902,7 @@ fn check_join_partition_mode(
             None,
             PartitionMode::Auto,
             NullEquality::NullEqualsNothing,
+            false,
         )
         .unwrap(),
     );
@@ -916,18 +913,15 @@ fn check_join_partition_mode(
     if !is_swapped {
         let swapped_join = optimized_join
-            .as_any()
            .downcast_ref::<HashJoinExec>()
             .expect("The type of the plan should not be changed");
         assert_eq!(*swapped_join.partition_mode(), expected_mode);
     } else {
         let swapping_projection = optimized_join
-            .as_any()
            .downcast_ref::<ProjectionExec>()
             .expect("A proj is required to swap columns back to their original order");
         let swapped_join = swapping_projection
             .input()
-            .as_any()
            .downcast_ref::<HashJoinExec>()
             .expect("The type of the plan should not be changed");
@@ -949,10 +943,10 @@ impl Stream for UnboundedStream {
         mut self: Pin<&mut Self>,
         _cx: &mut Context<'_>,
     ) -> Poll<Option<Result<RecordBatch>>> {
-        if let Some(val) = self.batch_produce {
-            if val <= self.count {
-                return Poll::Ready(None);
-            }
+        if let Some(val) = self.batch_produce
+            && val <= self.count
+        {
+            return Poll::Ready(None);
         }
         self.count += 1;
         Poll::Ready(Some(Ok(self.batch.clone())))
@@ -970,7 +964,7 @@ impl RecordBatchStream for UnboundedStream {
 pub struct UnboundedExec {
     batch_produce: Option<usize>,
     batch: RecordBatch,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl UnboundedExec {
@@ -986,7 +980,7 @@ impl UnboundedExec {
         Self {
             batch_produce,
             batch,
-            cache,
+            cache: Arc::new(cache),
         }
     }
 
@@ -1039,11 +1033,7 @@ impl ExecutionPlan for UnboundedExec {
         Self::static_name()
     }
 
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -1069,6 +1059,20 @@ impl ExecutionPlan for UnboundedExec {
             batch: self.batch.clone(),
         }))
     }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in the output ordering from equivalence properties
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = self.cache.output_ordering() {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
+    }
 }
 
 #[derive(Eq, PartialEq, Debug)]
@@ -1082,20 +1086,21 @@ pub enum SourceType {
 pub struct StatisticsExec {
     stats: Statistics,
     schema: Arc<Schema>,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl StatisticsExec {
     pub fn new(stats: Statistics, schema: Schema) -> Self {
         assert_eq!(
-            stats.column_statistics.len(), schema.fields().len(),
-            "if defined, the column statistics vector length should be the number of fields"
-        );
+            stats.column_statistics.len(),
+            schema.fields().len(),
+            "if defined, the column statistics vector length should be the number of fields"
+        );
         let cache = Self::compute_properties(Arc::new(schema.clone()));
         Self {
             stats,
             schema: Arc::new(schema),
-            cache,
+            cache: Arc::new(cache),
         }
     }
 
@@ -1139,11 +1144,7 @@ impl ExecutionPlan for StatisticsExec {
         Self::static_name()
     }
 
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -1166,16 +1167,26 @@ impl ExecutionPlan for StatisticsExec {
         unimplemented!("This plan only serves for testing statistics")
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        Ok(self.stats.clone())
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
-        Ok(if partition.is_some() {
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        Ok(Arc::new(if partition.is_some() {
             Statistics::new_unknown(&self.schema)
         } else {
             self.stats.clone()
-        })
+        }))
+    }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in the output ordering from equivalence properties
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = self.cache.output_ordering() {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
     }
 }
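Editorial aside: both test operators implement the new `apply_expressions` hook the same way, walking the sort expressions of their cached output ordering and letting the callback steer traversal via `TreeNodeRecursion`. A hypothetical caller-side sketch (variable names invented) of how a visitor consumes the hook:

// Collect a textual form of every expression the node exposes. Returning
// TreeNodeRecursion::Continue keeps visiting the remaining siblings.
let mut seen: Vec<String> = Vec::new();
plan.apply_expressions(&mut |expr| {
    seen.push(expr.to_string());
    Ok(TreeNodeRecursion::Continue)
})?;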
@@ -1553,16 +1564,16 @@ async fn test_join_with_maybe_swap_unbounded_case(t: TestCase) -> Result<()> {
         None,
         t.initial_mode,
         NullEquality::NullEqualsNothing,
+        false,
     )?) as _;
 
     let optimized_join_plan =
         JoinSelection::new().optimize(Arc::clone(&join), &ConfigOptions::new())?;
 
     // If swap did happen
-    let projection_added = optimized_join_plan.as_any().is::<ProjectionExec>();
+    let projection_added = optimized_join_plan.is::<ProjectionExec>();
     let plan = if projection_added {
         let proj = optimized_join_plan
-            .as_any()
            .downcast_ref::<ProjectionExec>()
             .expect("A proj is required to swap columns back to their original order");
        Arc::<dyn ExecutionPlan>::clone(proj.input())
@@ -1576,7 +1587,7 @@ async fn test_join_with_maybe_swap_unbounded_case(t: TestCase) -> Result<()> {
         join_type,
         mode,
         ..
-    }) = plan.as_any().downcast_ref::<HashJoinExec>()
+    }) = plan.downcast_ref::<HashJoinExec>()
     {
         let left_changed = Arc::ptr_eq(left, &right_exec);
         let right_changed = Arc::ptr_eq(right, &left_exec);
diff --git a/datafusion/core/tests/physical_optimizer/limit_pushdown.rs b/datafusion/core/tests/physical_optimizer/limit_pushdown.rs
index 56d48901f284d..5f9b7e50848fd 100644
--- a/datafusion/core/tests/physical_optimizer/limit_pushdown.rs
+++ b/datafusion/core/tests/physical_optimizer/limit_pushdown.rs
@@ -18,7 +18,7 @@
 use std::sync::Arc;
 
 use crate::physical_optimizer::test_utils::{
-    coalesce_batches_exec, coalesce_partitions_exec, global_limit_exec, local_limit_exec,
+    coalesce_partitions_exec, global_limit_exec, hash_join_exec, local_limit_exec,
     sort_exec, sort_preserving_merge_exec, stream_exec,
 };
 
@@ -26,17 +26,19 @@
 use arrow::compute::SortOptions;
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use datafusion_common::config::ConfigOptions;
 use datafusion_common::error::Result;
-use datafusion_expr::Operator;
-use datafusion_physical_expr::expressions::{col, lit, BinaryExpr};
+use datafusion_expr::{JoinType, Operator};
 use datafusion_physical_expr::Partitioning;
+use datafusion_physical_expr::expressions::{BinaryExpr, col, lit};
+use datafusion_physical_expr_common::physical_expr::PhysicalExprRef;
 use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
-use datafusion_physical_optimizer::limit_pushdown::LimitPushdown;
 use datafusion_physical_optimizer::PhysicalOptimizerRule;
+use datafusion_physical_optimizer::limit_pushdown::LimitPushdown;
 use datafusion_physical_plan::empty::EmptyExec;
 use datafusion_physical_plan::filter::FilterExec;
+use datafusion_physical_plan::joins::NestedLoopJoinExec;
 use datafusion_physical_plan::projection::ProjectionExec;
 use datafusion_physical_plan::repartition::RepartitionExec;
-use datafusion_physical_plan::{get_plan_string, ExecutionPlan};
+use datafusion_physical_plan::{ExecutionPlan, get_plan_string};
 
 fn create_schema() -> SchemaRef {
     Arc::new(Schema::new(vec![
@@ -87,6 +89,20 @@ fn empty_exec(schema: SchemaRef) -> Arc<dyn ExecutionPlan> {
     Arc::new(EmptyExec::new(schema))
 }
 
+fn nested_loop_join_exec(
+    left: Arc<dyn ExecutionPlan>,
+    right: Arc<dyn ExecutionPlan>,
+    join_type: JoinType,
+) -> Result<Arc<dyn ExecutionPlan>> {
+    Ok(Arc::new(NestedLoopJoinExec::try_new(
+        left, right, None, &join_type, None,
+    )?))
+}
+
+fn format_plan(plan: &Arc<dyn ExecutionPlan>) -> String {
+    get_plan_string(plan).join("\n")
+}
+
 #[test]
 fn transforms_streaming_table_exec_into_fetching_version_when_skip_is_zero() -> Result<()>
 {
@@ -94,148 +110,251 @@ fn transforms_streaming_table_exec_into_fetching_version_when_skip_is_zero() ->
     let schema = create_schema();
     let streaming_table = stream_exec(&schema);
     let global_limit = global_limit_exec(streaming_table, 0, Some(5));
 
-    let initial = get_plan_string(&global_limit);
-    let expected_initial = [
-        "GlobalLimitExec: skip=0, fetch=5",
-        "  StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true"
-    ];
-    assert_eq!(initial, expected_initial);
+    let initial = format_plan(&global_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    GlobalLimitExec: skip=0, fetch=5
+      StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true
+    "
+    );
 
     let after_optimize =
         LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?;
 
-    let expected = [
-        "StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true, fetch=5"
-    ];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let optimized = format_plan(&after_optimize);
+    insta::assert_snapshot!(
+        optimized,
@"StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true, fetch=5" + ); Ok(()) } #[test] -fn transforms_streaming_table_exec_into_fetching_version_and_keeps_the_global_limit_when_skip_is_nonzero( -) -> Result<()> { +fn transforms_streaming_table_exec_into_fetching_version_and_keeps_the_global_limit_when_skip_is_nonzero() +-> Result<()> { let schema = create_schema(); let streaming_table = stream_exec(&schema); let global_limit = global_limit_exec(streaming_table, 2, Some(5)); - let initial = get_plan_string(&global_limit); - let expected_initial = [ - "GlobalLimitExec: skip=2, fetch=5", - " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" - ]; - assert_eq!(initial, expected_initial); + let initial = format_plan(&global_limit); + insta::assert_snapshot!( + initial, + @r" + GlobalLimitExec: skip=2, fetch=5 + StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true + " + ); let after_optimize = LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?; - let expected = [ - "GlobalLimitExec: skip=2, fetch=5", - " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true, fetch=7" - ]; - assert_eq!(get_plan_string(&after_optimize), expected); + let optimized = format_plan(&after_optimize); + insta::assert_snapshot!( + optimized, + @r" + GlobalLimitExec: skip=2, fetch=5 + StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true, fetch=7 + " + ); Ok(()) } +fn join_on_columns( + left_col: &str, + right_col: &str, +) -> Vec<(PhysicalExprRef, PhysicalExprRef)> { + vec![( + Arc::new(datafusion_physical_expr::expressions::Column::new( + left_col, 0, + )) as _, + Arc::new(datafusion_physical_expr::expressions::Column::new( + right_col, 0, + )) as _, + )] +} + #[test] -fn transforms_coalesce_batches_exec_into_fetching_version_and_removes_local_limit( -) -> Result<()> { +fn absorbs_limit_into_hash_join_inner() -> Result<()> { + // HashJoinExec with Inner join should absorb limit via with_fetch let schema = create_schema(); - let streaming_table = stream_exec(&schema); - let repartition = repartition_exec(streaming_table)?; - let filter = filter_exec(schema, repartition)?; - let coalesce_batches = coalesce_batches_exec(filter, 8192); - let local_limit = local_limit_exec(coalesce_batches, 5); - let coalesce_partitions = coalesce_partitions_exec(local_limit); - let global_limit = global_limit_exec(coalesce_partitions, 0, Some(5)); - - let initial = get_plan_string(&global_limit); - let expected_initial = [ - "GlobalLimitExec: skip=0, fetch=5", - " CoalescePartitionsExec", - " LocalLimitExec: fetch=5", - " CoalesceBatchesExec: target_batch_size=8192", - " FilterExec: c3@2 > 0", - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", - " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" - ]; - assert_eq!(initial, expected_initial); + let left = empty_exec(Arc::clone(&schema)); + let right = empty_exec(Arc::clone(&schema)); + let on = join_on_columns("c1", "c1"); + let hash_join = hash_join_exec(left, right, on, None, &JoinType::Inner)?; + let global_limit = global_limit_exec(hash_join, 0, Some(5)); + + let initial = format_plan(&global_limit); + insta::assert_snapshot!( + initial, + @r" + GlobalLimitExec: skip=0, fetch=5 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c1@0, c1@0)] + EmptyExec + EmptyExec + " + ); let after_optimize = LimitPushdown::new().optimize(global_limit, 
&ConfigOptions::new())?;
-
-    let expected = [
-        "CoalescePartitionsExec: fetch=5",
-        "  CoalesceBatchesExec: target_batch_size=8192, fetch=5",
-        "    FilterExec: c3@2 > 0",
-        "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-        "        StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true"
-    ];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let optimized = format_plan(&after_optimize);
+    // The limit should be absorbed by the hash join (not pushed to children)
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c1@0, c1@0)], fetch=5
+      EmptyExec
+      EmptyExec
+    "
+    );
 
     Ok(())
 }
 
 #[test]
-fn pushes_global_limit_exec_through_projection_exec() -> Result<()> {
+fn absorbs_limit_into_hash_join_right() -> Result<()> {
+    // HashJoinExec with Right join should absorb limit via with_fetch
     let schema = create_schema();
-    let streaming_table = stream_exec(&schema);
-    let filter = filter_exec(Arc::clone(&schema), streaming_table)?;
-    let projection = projection_exec(schema, filter)?;
-    let global_limit = global_limit_exec(projection, 0, Some(5));
+    let left = empty_exec(Arc::clone(&schema));
+    let right = empty_exec(Arc::clone(&schema));
+    let on = join_on_columns("c1", "c1");
+    let hash_join = hash_join_exec(left, right, on, None, &JoinType::Right)?;
+    let global_limit = global_limit_exec(hash_join, 0, Some(10));
+
+    let initial = format_plan(&global_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    GlobalLimitExec: skip=0, fetch=10
+      HashJoinExec: mode=Partitioned, join_type=Right, on=[(c1@0, c1@0)]
+        EmptyExec
+        EmptyExec
+    "
+    );
+
+    let after_optimize =
+        LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?;
+    let optimized = format_plan(&after_optimize);
+    // The limit should be absorbed by the hash join
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    HashJoinExec: mode=Partitioned, join_type=Right, on=[(c1@0, c1@0)], fetch=10
+      EmptyExec
+      EmptyExec
+    "
+    );
 
-    let initial = get_plan_string(&global_limit);
-    let expected_initial = [
-        "GlobalLimitExec: skip=0, fetch=5",
-        "  ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]",
-        "    FilterExec: c3@2 > 0",
-        "      StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true"
-    ];
-    assert_eq!(initial, expected_initial);
+    Ok(())
+}
+
+#[test]
+fn absorbs_limit_into_hash_join_left() -> Result<()> {
+    // during probing, then unmatched rows at the end, stopping when limit is reached
+    let schema = create_schema();
+    let left = empty_exec(Arc::clone(&schema));
+    let right = empty_exec(Arc::clone(&schema));
+    let on = join_on_columns("c1", "c1");
+    let hash_join = hash_join_exec(left, right, on, None, &JoinType::Left)?;
+    let global_limit = global_limit_exec(hash_join, 0, Some(5));
+
+    let initial = format_plan(&global_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    GlobalLimitExec: skip=0, fetch=5
+      HashJoinExec: mode=Partitioned, join_type=Left, on=[(c1@0, c1@0)]
+        EmptyExec
+        EmptyExec
+    "
+    );
 
     let after_optimize =
        LimitPushdown::new().optimize(global_limit,
partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" - ]; - assert_eq!(get_plan_string(&after_optimize), expected); + Ok(()) +} + +#[test] +fn absorbs_limit_with_skip_into_hash_join() -> Result<()> { + let schema = create_schema(); + let left = empty_exec(Arc::clone(&schema)); + let right = empty_exec(Arc::clone(&schema)); + let on = join_on_columns("c1", "c1"); + let hash_join = hash_join_exec(left, right, on, None, &JoinType::Inner)?; + let global_limit = global_limit_exec(hash_join, 3, Some(5)); + + let initial = format_plan(&global_limit); + insta::assert_snapshot!( + initial, + @r" + GlobalLimitExec: skip=3, fetch=5 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c1@0, c1@0)] + EmptyExec + EmptyExec + " + ); + + let after_optimize = + LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?; + let optimized = format_plan(&after_optimize); + // With skip, GlobalLimit is kept but fetch (skip + limit = 8) is absorbed by the join + insta::assert_snapshot!( + optimized, + @r" + GlobalLimitExec: skip=3, fetch=5 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c1@0, c1@0)], fetch=8 + EmptyExec + EmptyExec + " + ); Ok(()) } #[test] -fn pushes_global_limit_exec_through_projection_exec_and_transforms_coalesce_batches_exec_into_fetching_version( -) -> Result<()> { +fn pushes_global_limit_exec_through_projection_exec() -> Result<()> { let schema = create_schema(); let streaming_table = stream_exec(&schema); - let coalesce_batches = coalesce_batches_exec(streaming_table, 8192); - let projection = projection_exec(schema, coalesce_batches)?; + let filter = filter_exec(Arc::clone(&schema), streaming_table)?; + let projection = projection_exec(schema, filter)?; let global_limit = global_limit_exec(projection, 0, Some(5)); - let initial = get_plan_string(&global_limit); - let expected_initial = [ - "GlobalLimitExec: skip=0, fetch=5", - " ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]", - " CoalesceBatchesExec: target_batch_size=8192", - " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" - ]; - - assert_eq!(initial, expected_initial); + let initial = format_plan(&global_limit); + insta::assert_snapshot!( + initial, + @r" + GlobalLimitExec: skip=0, fetch=5 + ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3] + FilterExec: c3@2 > 0 + StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true + " + ); let after_optimize = LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?; - let expected = [ - "ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]", - " CoalesceBatchesExec: target_batch_size=8192, fetch=5", - " StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true" - ]; - assert_eq!(get_plan_string(&after_optimize), expected); + let optimized = format_plan(&after_optimize); + insta::assert_snapshot!( + optimized, + @r" + ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3] + FilterExec: c3@2 > 0, fetch=5 + StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true + " + ); Ok(()) } @@ -244,8 +363,7 @@ fn pushes_global_limit_exec_through_projection_exec_and_transforms_coalesce_batc fn pushes_global_limit_into_multiple_fetch_plans() -> Result<()> { let schema = create_schema(); let streaming_table = stream_exec(&schema); - let coalesce_batches = coalesce_batches_exec(streaming_table, 8192); - let projection = projection_exec(Arc::clone(&schema), coalesce_batches)?; + let projection = 
@@ -244,8 +363,7 @@ fn pushes_global_limit_into_multiple_fetch_plans() -> Result<()> {
     let schema = create_schema();
     let streaming_table = stream_exec(&schema);
-    let coalesce_batches = coalesce_batches_exec(streaming_table, 8192);
-    let projection = projection_exec(Arc::clone(&schema), coalesce_batches)?;
+    let projection = projection_exec(Arc::clone(&schema), streaming_table)?;
     let repartition = repartition_exec(projection)?;
     let ordering: LexOrdering = [PhysicalSortExpr {
         expr: col("c1", &schema)?,
@@ -256,31 +374,33 @@ fn pushes_global_limit_into_multiple_fetch_plans() -> Result<()> {
     let spm = sort_preserving_merge_exec(ordering, sort);
     let global_limit = global_limit_exec(spm, 0, Some(5));
 
-    let initial = get_plan_string(&global_limit);
-    let expected_initial = [
-        "GlobalLimitExec: skip=0, fetch=5",
-        "  SortPreservingMergeExec: [c1@0 ASC]",
-        "    SortExec: expr=[c1@0 ASC], preserve_partitioning=[false]",
-        "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-        "        ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]",
-        "          CoalesceBatchesExec: target_batch_size=8192",
-        "            StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true"
-    ];
-
-    assert_eq!(initial, expected_initial);
+    let initial = format_plan(&global_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    GlobalLimitExec: skip=0, fetch=5
+      SortPreservingMergeExec: [c1@0 ASC]
+        SortExec: expr=[c1@0 ASC], preserve_partitioning=[false]
+          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
+            ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]
+              StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true
+    "
+    );
 
     let after_optimize =
         LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?;
 
-    let expected = [
-        "SortPreservingMergeExec: [c1@0 ASC], fetch=5",
-        "  SortExec: TopK(fetch=5), expr=[c1@0 ASC], preserve_partitioning=[false]",
-        "    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-        "      ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]",
-        "        CoalesceBatchesExec: target_batch_size=8192",
-        "          StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true"
-    ];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let optimized = format_plan(&after_optimize);
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    SortPreservingMergeExec: [c1@0 ASC], fetch=5
+      SortExec: TopK(fetch=5), expr=[c1@0 ASC], preserve_partitioning=[false]
+        RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
+          ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]
+            StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true
+    "
+    );
 
     Ok(())
 }
@@ -295,26 +415,31 @@ fn keeps_pushed_local_limit_exec_when_there_are_multiple_input_partitions() -> R
     let coalesce_partitions = coalesce_partitions_exec(filter);
     let global_limit = global_limit_exec(coalesce_partitions, 0, Some(5));
 
-    let initial = get_plan_string(&global_limit);
-    let expected_initial = [
-        "GlobalLimitExec: skip=0, fetch=5",
-        "  CoalescePartitionsExec",
-        "    FilterExec: c3@2 > 0",
-        "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-        "        StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true"
-    ];
-    assert_eq!(initial, expected_initial);
+    let initial = format_plan(&global_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    GlobalLimitExec: skip=0, fetch=5
+      CoalescePartitionsExec
+        FilterExec: c3@2 > 0
+          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
+            StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true
+    "
+    );
 
     let after_optimize =
         LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?;
 
-    let expected = [
-        "CoalescePartitionsExec: fetch=5",
-        "  FilterExec: c3@2 > 0",
-        "    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-        "      StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true"
-    ];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let optimized = format_plan(&after_optimize);
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    CoalescePartitionsExec: fetch=5
+      FilterExec: c3@2 > 0, fetch=5
+        RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
+          StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true
+    "
+    );
 
     Ok(())
 }
@@ -326,20 +451,27 @@ fn merges_local_limit_with_local_limit() -> Result<()> {
     let child_local_limit = local_limit_exec(empty_exec, 10);
     let parent_local_limit = local_limit_exec(child_local_limit, 20);
 
-    let initial = get_plan_string(&parent_local_limit);
-    let expected_initial = [
-        "LocalLimitExec: fetch=20",
-        "  LocalLimitExec: fetch=10",
-        "    EmptyExec",
-    ];
-
-    assert_eq!(initial, expected_initial);
+    let initial = format_plan(&parent_local_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    LocalLimitExec: fetch=20
+      LocalLimitExec: fetch=10
+        EmptyExec
+    "
+    );
 
     let after_optimize =
         LimitPushdown::new().optimize(parent_local_limit, &ConfigOptions::new())?;
 
-    let expected = ["GlobalLimitExec: skip=0, fetch=10", "  EmptyExec"];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let optimized = format_plan(&after_optimize);
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    GlobalLimitExec: skip=0, fetch=10
+      EmptyExec
+    "
+    );
 
     Ok(())
 }
@@ -351,20 +483,27 @@ fn merges_global_limit_with_global_limit() -> Result<()> {
     let child_global_limit = global_limit_exec(empty_exec, 10, Some(30));
     let parent_global_limit = global_limit_exec(child_global_limit, 10, Some(20));
 
-    let initial = get_plan_string(&parent_global_limit);
-    let expected_initial = [
-        "GlobalLimitExec: skip=10, fetch=20",
-        "  GlobalLimitExec: skip=10, fetch=30",
-        "    EmptyExec",
-    ];
-
-    assert_eq!(initial, expected_initial);
+    let initial = format_plan(&parent_global_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    GlobalLimitExec: skip=10, fetch=20
+      GlobalLimitExec: skip=10, fetch=30
+        EmptyExec
+    "
+    );
 
     let after_optimize =
         LimitPushdown::new().optimize(parent_global_limit, &ConfigOptions::new())?;
 
-    let expected = ["GlobalLimitExec: skip=20, fetch=20", "  EmptyExec"];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let optimized = format_plan(&after_optimize);
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    GlobalLimitExec: skip=20, fetch=20
+      EmptyExec
+    "
+    );
 
     Ok(())
 }
@@ -376,20 +515,27 @@ fn merges_global_limit_with_local_limit() -> Result<()> {
     let local_limit = local_limit_exec(empty_exec, 40);
     let global_limit = global_limit_exec(local_limit, 20, Some(30));
 
-    let initial = get_plan_string(&global_limit);
-    let expected_initial = [
-        "GlobalLimitExec: skip=20, fetch=30",
-        "  LocalLimitExec: fetch=40",
-        "    EmptyExec",
-    ];
-
-    assert_eq!(initial, expected_initial);
+    let initial = format_plan(&global_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    GlobalLimitExec: skip=20, fetch=30
+      LocalLimitExec: fetch=40
+        EmptyExec
+    "
+    );
 
     let after_optimize =
         LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?;
 
-    let expected = ["GlobalLimitExec: skip=20, fetch=20", "  EmptyExec"];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let optimized = format_plan(&after_optimize);
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    GlobalLimitExec: skip=20, fetch=20
+      EmptyExec
+    "
+    );
 
     Ok(())
 }
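Editorial aside: the merged `skip`/`fetch` values asserted by these limit-merging tests all follow one composition rule: skips add up, and the parent's fetch is capped by whatever survives the child's fetch after the extra skip. A minimal sketch of that rule; `merge_limits` is a hypothetical helper for illustration, not an API of `LimitPushdown`:

// Compose parent (skip2, fetch2) over child (skip1, fetch1); the child
// limit is applied first and the parent consumes its output.
fn merge_limits(
    (skip1, fetch1): (usize, Option<usize>),
    (skip2, fetch2): (usize, Option<usize>),
) -> (usize, Option<usize>) {
    // Rows surviving the child limit after the parent's extra skip:
    let remaining = fetch1.map(|f| f.saturating_sub(skip2));
    let fetch = match (remaining, fetch2) {
        (Some(r), Some(f)) => Some(r.min(f)),
        (Some(r), None) => Some(r),
        (None, f) => f,
    };
    (skip1 + skip2, fetch)
}

#[test]
fn merge_limits_matches_snapshots() {
    // merges_global_limit_with_global_limit: (skip=10, fetch=30) then
    // (skip=10, fetch=20) collapses to skip=20, fetch=20.
    assert_eq!(merge_limits((10, Some(30)), (10, Some(20))), (20, Some(20)));
    // merges_global_limit_with_local_limit: LocalLimit(40) then
    // GlobalLimit(skip=20, fetch=30) collapses to skip=20, fetch=20.
    assert_eq!(merge_limits((0, Some(40)), (20, Some(30))), (20, Some(20)));
}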
@@ -401,20 +547,173 @@ fn merges_local_limit_with_global_limit() -> Result<()> {
     let global_limit = global_limit_exec(empty_exec, 20, Some(30));
     let local_limit = local_limit_exec(global_limit, 20);
 
-    let initial = get_plan_string(&local_limit);
-    let expected_initial = [
-        "LocalLimitExec: fetch=20",
-        "  GlobalLimitExec: skip=20, fetch=30",
-        "    EmptyExec",
-    ];
-
-    assert_eq!(initial, expected_initial);
+    let initial = format_plan(&local_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    LocalLimitExec: fetch=20
+      GlobalLimitExec: skip=20, fetch=30
+        EmptyExec
+    "
+    );
 
     let after_optimize =
         LimitPushdown::new().optimize(local_limit, &ConfigOptions::new())?;
 
-    let expected = ["GlobalLimitExec: skip=20, fetch=20", "  EmptyExec"];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let optimized = format_plan(&after_optimize);
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    GlobalLimitExec: skip=20, fetch=20
+      EmptyExec
+    "
+    );
+
+    Ok(())
+}
+
+#[test]
+fn preserves_nested_global_limit() -> Result<()> {
+    // If there are multiple limits in an execution plan, they all need to be
+    // preserved in the optimized plan.
+    //
+    // Plan structure:
+    //   GlobalLimitExec: skip=1, fetch=1
+    //     NestedLoopJoinExec (Left)
+    //       EmptyExec (left side)
+    //       GlobalLimitExec: skip=2, fetch=1
+    //         NestedLoopJoinExec (Right)
+    //           EmptyExec (left side)
+    //           EmptyExec (right side)
+    let schema = create_schema();
+
+    // Build inner join: NestedLoopJoin(Empty, Empty)
+    let inner_left = empty_exec(Arc::clone(&schema));
+    let inner_right = empty_exec(Arc::clone(&schema));
+    let inner_join = nested_loop_join_exec(inner_left, inner_right, JoinType::Right)?;
+
+    // Add inner limit: GlobalLimitExec: skip=2, fetch=1
+    let inner_limit = global_limit_exec(inner_join, 2, Some(1));
+
+    // Build outer join: NestedLoopJoin(Empty, GlobalLimit)
+    let outer_left = empty_exec(Arc::clone(&schema));
+    let outer_join = nested_loop_join_exec(outer_left, inner_limit, JoinType::Left)?;
+
+    // Add outer limit: GlobalLimitExec: skip=1, fetch=1
+    let outer_limit = global_limit_exec(outer_join, 1, Some(1));
+
+    let initial = format_plan(&outer_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    GlobalLimitExec: skip=1, fetch=1
+      NestedLoopJoinExec: join_type=Left
+        EmptyExec
+        GlobalLimitExec: skip=2, fetch=1
+          NestedLoopJoinExec: join_type=Right
+            EmptyExec
+            EmptyExec
+    "
+    );
+
+    let after_optimize =
+        LimitPushdown::new().optimize(outer_limit, &ConfigOptions::new())?;
+    let optimized = format_plan(&after_optimize);
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    GlobalLimitExec: skip=1, fetch=1
+      NestedLoopJoinExec: join_type=Left
+        EmptyExec
+        GlobalLimitExec: skip=2, fetch=1
+          NestedLoopJoinExec: join_type=Right
+            EmptyExec
+            EmptyExec
+    "
+    );
+
+    Ok(())
+}
+
+#[test]
+fn preserves_skip_before_sort() -> Result<()> {
+    // If there's a limit with skip before a node that (1) supports fetch but
+    // (2) does not support limit pushdown, that limit should not be removed.
+    //
+    // Plan structure:
+    //   GlobalLimitExec: skip=1, fetch=None
+    //     SortExec: TopK(fetch=4)
+    //       EmptyExec
+    let schema = create_schema();
+
+    let empty = empty_exec(Arc::clone(&schema));
+
+    let ordering = [PhysicalSortExpr {
+        expr: col("c1", &schema)?,
+        options: SortOptions::default(),
+    }];
+    let sort = sort_exec(ordering.into(), empty)
+        .with_fetch(Some(4))
+        .unwrap();
+
+    let outer_limit = global_limit_exec(sort, 1, None);
+
+    let initial = format_plan(&outer_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    GlobalLimitExec: skip=1, fetch=None
+      SortExec: TopK(fetch=4), expr=[c1@0 ASC], preserve_partitioning=[false]
+        EmptyExec
+    "
+    );
+
+    let after_optimize =
+        LimitPushdown::new().optimize(outer_limit, &ConfigOptions::new())?;
+    let optimized = format_plan(&after_optimize);
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    GlobalLimitExec: skip=1, fetch=3
+      SortExec: TopK(fetch=4), expr=[c1@0 ASC], preserve_partitioning=[false]
+        EmptyExec
+    "
+    );
+
+    Ok(())
+}
+
+#[test]
+fn no_limit_preserves_plan_identity() -> Result<()> {
+    // When there is no limit in the plan, the optimizer should return the
+    // exact same Arc (pointer-equal) for every node, avoiding unnecessary
+    // plan reconstruction and property recomputation.
+    let schema = create_schema();
+
+    let left = empty_exec(Arc::clone(&schema));
+    let right = empty_exec(Arc::clone(&schema));
+    let on = join_on_columns("c1", "c1");
+    let join = hash_join_exec(left, right, on, None, &JoinType::Inner)?;
+    let plan = filter_exec(Arc::clone(&schema), join)?;
+
+    let optimized =
+        LimitPushdown::new().optimize(Arc::clone(&plan), &ConfigOptions::new())?;
+
+    assert!(
+        Arc::ptr_eq(&plan, &optimized),
+        "Expected optimizer to return the same Arc when no limit is present"
+    );
+
+    let optimized = format_plan(&optimized);
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    FilterExec: c3@2 > 0
+      HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c1@0, c1@0)]
+        EmptyExec
+        EmptyExec
+    "
+    );
 
     Ok(())
 }
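Editorial aside: for `preserves_skip_before_sort` above, the `fetch=3` in the optimized snapshot is the arithmetic consequence of the child cap: `SortExec: TopK(fetch=4)` emits at most 4 rows, and after skipping 1 the parent can return at most 3. A worked check with those values (illustrative):

// TopK(fetch=4) bounds the input; GlobalLimitExec skips 1 row, so its
// effective fetch is 4 - 1 = 3, exactly what the snapshot records.
let (child_fetch, skip) = (4usize, 1usize);
assert_eq!(child_fetch - skip, 3);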
diff --git a/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs b/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs
index ad15d6803413b..c523b4a752a82 100644
--- a/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs
+++ b/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs
@@ -21,8 +21,8 @@
 use insta::assert_snapshot;
 use std::sync::Arc;
 
 use crate::physical_optimizer::test_utils::{
-    build_group_by, get_optimized_plan, mock_data, parquet_exec_with_sort, schema,
-    TestAggregate,
+    TestAggregate, build_group_by, get_optimized_plan, mock_data, parquet_exec_with_sort,
+    schema,
 };
 
 use arrow::datatypes::DataType;
@@ -34,10 +34,10 @@
 use datafusion_expr::Operator;
 use datafusion_physical_expr::expressions::{self, cast, col};
 use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
 use datafusion_physical_plan::{
+    ExecutionPlan,
     aggregates::{AggregateExec, AggregateMode},
     collect,
     limit::{GlobalLimitExec, LocalLimitExec},
-    ExecutionPlan,
 };
 
 async fn run_plan_and_format(plan: Arc<dyn ExecutionPlan>) -> Result<String> {
diff --git a/datafusion/core/tests/physical_optimizer/mod.rs b/datafusion/core/tests/physical_optimizer/mod.rs
index 936c02eb2a02d..b7ba661d2343a 100644
--- a/datafusion/core/tests/physical_optimizer/mod.rs
+++ b/datafusion/core/tests/physical_optimizer/mod.rs
@@ -17,18 +17,26 @@
 
 //! Physical Optimizer integration tests
 
+#[expect(clippy::needless_pass_by_value)]
 mod aggregate_statistics;
 mod combine_partial_final_agg;
+#[expect(clippy::needless_pass_by_value)]
 mod enforce_distribution;
 mod enforce_sorting;
 mod enforce_sorting_monotonicity;
 mod filter_pushdown;
 mod join_selection;
+#[expect(clippy::needless_pass_by_value)]
 mod limit_pushdown;
 mod limited_distinct_aggregation;
 mod partition_statistics;
 mod projection_pushdown;
+mod pushdown_sort;
 mod replace_with_order_preserving_variants;
 mod sanity_checker;
+#[expect(clippy::needless_pass_by_value)]
 mod test_utils;
 mod window_optimize;
+mod window_topn;
+
+mod pushdown_utils;
diff --git a/datafusion/core/tests/physical_optimizer/partition_statistics.rs b/datafusion/core/tests/physical_optimizer/partition_statistics.rs
index 49dc5b845605d..f84d79146b24d 100644
--- a/datafusion/core/tests/physical_optimizer/partition_statistics.rs
+++ b/datafusion/core/tests/physical_optimizer/partition_statistics.rs
@@ -25,36 +25,41 @@ mod test {
     use datafusion::datasource::listing::ListingTable;
     use datafusion::prelude::SessionContext;
     use datafusion_catalog::TableProvider;
-    use datafusion_common::stats::Precision;
     use datafusion_common::Result;
-    use datafusion_common::{ColumnStatistics, ScalarValue, Statistics};
-    use datafusion_execution::config::SessionConfig;
+    use datafusion_common::stats::Precision;
+    use datafusion_common::{
+        ColumnStatistics, JoinType, NullEquality, ScalarValue, Statistics,
+    };
     use datafusion_execution::TaskContext;
+    use datafusion_execution::config::SessionConfig;
+    use datafusion_expr::{WindowFrame, WindowFunctionDefinition};
     use datafusion_expr_common::operator::Operator;
     use datafusion_functions_aggregate::count::count_udaf;
-    use datafusion_physical_expr::aggregate::AggregateExprBuilder;
-    use datafusion_physical_expr::expressions::{binary, col, lit, Column};
     use datafusion_physical_expr::Partitioning;
+    use datafusion_physical_expr::aggregate::AggregateExprBuilder;
+    use datafusion_physical_expr::expressions::{Column, binary, col, lit};
     use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
     use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
     use datafusion_physical_plan::aggregates::{
         AggregateExec, AggregateMode, PhysicalGroupBy,
     };
-    use datafusion_physical_plan::coalesce_batches::CoalesceBatchesExec;
     use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;
    use datafusion_physical_plan::common::compute_record_batch_statistics;
     use datafusion_physical_plan::empty::EmptyExec;
     use datafusion_physical_plan::filter::FilterExec;
-    use datafusion_physical_plan::joins::CrossJoinExec;
+    use datafusion_physical_plan::joins::{
+        CrossJoinExec, HashJoinExec, NestedLoopJoinExec, PartitionMode,
+    };
     use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
     use datafusion_physical_plan::placeholder_row::PlaceholderRowExec;
     use datafusion_physical_plan::projection::{ProjectionExec, ProjectionExpr};
     use datafusion_physical_plan::repartition::RepartitionExec;
     use datafusion_physical_plan::sorts::sort::SortExec;
     use datafusion_physical_plan::union::{InterleaveExec, UnionExec};
+    use datafusion_physical_plan::windows::{WindowAggExec, create_window_expr};
     use datafusion_physical_plan::{
-        execute_stream_partitioned, get_plan_string, ExecutionPlan,
-        ExecutionPlanProperties,
+        ExecutionPlan, ExecutionPlanProperties, execute_stream_partitioned,
+        get_plan_string,
     };
     use futures::TryStreamExt;
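Editorial aside: the partition-statistics tests below hard-code Date32 day counts (e.g. 2025-03-01 = 20148). Date32 stores days since the Unix epoch, so the constants can be derived mechanically; a sketch using the `chrono` crate, which is an assumption for illustration — the test file itself does not depend on it:

// Sketch only: Date32 value = days between 1970-01-01 and the given date.
use chrono::NaiveDate;

let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap();
let date = NaiveDate::from_ymd_opt(2025, 3, 1).unwrap();
assert_eq!(date.signed_duration_since(epoch).num_days(), 20148);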
@@ -67,7 +72,7 @@ mod test {
     /// - Each partition has an "id" column (INT) with the following values:
     ///   - First partition: [3, 4]
     ///   - Second partition: [1, 2]
-    /// - Each row is 110 bytes in size
+    /// - Each partition has 16 bytes total (Int32 id: 4 bytes × 2 rows + Date32 date: 4 bytes × 2 rows)
     ///
     /// @param create_table_sql Optional parameter to set the create table SQL
     /// @param target_partition Optional parameter to set the target partitions
@@ -101,40 +106,61 @@ mod test {
             .await
             .unwrap();
         let table = ctx.table_provider(table_name.as_str()).await.unwrap();
-        let listing_table = table
-            .as_any()
-            .downcast_ref::<ListingTable>()
-            .unwrap()
-            .clone();
+        let listing_table = table.downcast_ref::<ListingTable>().unwrap().clone();
         listing_table
             .scan(&ctx.state(), None, &[], None)
             .await
             .unwrap()
     }
 
+    // Date32 values for test data (days since 1970-01-01):
+    //   2025-03-01 = 20148
+    //   2025-03-02 = 20149
+    //   2025-03-03 = 20150
+    //   2025-03-04 = 20151
+    const DATE_2025_03_01: i32 = 20148;
+    const DATE_2025_03_02: i32 = 20149;
+    const DATE_2025_03_03: i32 = 20150;
+    const DATE_2025_03_04: i32 = 20151;
+
     /// Helper function to create expected statistics for a partition with Int32 column
+    ///
+    /// If `date_range` is provided, includes exact statistics for the partition date column.
+    /// Partition column statistics are exact because all rows in a partition share the same value.
     fn create_partition_statistics(
         num_rows: usize,
         total_byte_size: usize,
         min_value: i32,
         max_value: i32,
-        include_date_column: bool,
+        date_range: Option<(i32, i32)>,
     ) -> Statistics {
+        // Int32 is 4 bytes per row
+        let int32_byte_size = num_rows * 4;
         let mut column_stats = vec![ColumnStatistics {
             null_count: Precision::Exact(0),
             max_value: Precision::Exact(ScalarValue::Int32(Some(max_value))),
             min_value: Precision::Exact(ScalarValue::Int32(Some(min_value))),
             sum_value: Precision::Absent,
             distinct_count: Precision::Absent,
+            byte_size: Precision::Exact(int32_byte_size),
         }];
 
-        if include_date_column {
+        if let Some((min_date, max_date)) = date_range {
+            // Partition column stats are computed from partition values:
+            // - null_count = 0 (partition values from paths are never null)
+            // - min/max are the merged partition values across files in the group
+            // - byte_size = num_rows * 4 (Date32 is 4 bytes per row)
+            // - distinct_count = Inexact(max_date - min_date + 1), derived from the
+            //   date range via interval analysis for temporal types
+            let date32_byte_size = num_rows * 4;
+            let distinct_dates = (max_date - min_date + 1) as usize;
             column_stats.push(ColumnStatistics {
-                null_count: Precision::Absent,
-                max_value: Precision::Absent,
-                min_value: Precision::Absent,
+                null_count: Precision::Exact(0),
+                max_value: Precision::Exact(ScalarValue::Date32(Some(max_date))),
+                min_value: Precision::Exact(ScalarValue::Date32(Some(min_date))),
                 sum_value: Precision::Absent,
-                distinct_count: Precision::Absent,
+                distinct_count: Precision::Inexact(distinct_dates),
+                byte_size: Precision::Exact(date32_byte_size),
             });
         }
@@ -214,14 +240,26 @@ mod test {
         let statistics = (0..scan.output_partitioning().partition_count())
             .map(|idx| scan.partition_statistics(Some(idx)))
            .collect::<Result<Vec<_>>>()?;
-        let expected_statistic_partition_1 =
-            create_partition_statistics(2, 110, 3, 4, true);
-        let expected_statistic_partition_2 =
-            create_partition_statistics(2, 110, 1, 2, true);
+        // Partition 1: ids [3,4], dates [2025-03-01, 2025-03-02]
+        let expected_statistic_partition_1 = create_partition_statistics(
+            2,
+            16,
+            3,
+            4,
+            Some((DATE_2025_03_01, DATE_2025_03_02)),
+        );
+        // Partition 2: ids [1,2], dates [2025-03-03, 2025-03-04]
+ let expected_statistic_partition_2 = create_partition_statistics( + 2, + 16, + 1, + 2, + Some((DATE_2025_03_03, DATE_2025_03_04)), + ); // Check the statistics of each partition assert_eq!(statistics.len(), 2); - assert_eq!(statistics[0], expected_statistic_partition_1); - assert_eq!(statistics[1], expected_statistic_partition_2); + assert_eq!(*statistics[0], expected_statistic_partition_1); + assert_eq!(*statistics[1], expected_statistic_partition_2); // Check the statistics_by_partition with real results let expected_stats = vec![ @@ -246,14 +284,15 @@ mod test { let statistics = (0..projection.output_partitioning().partition_count()) .map(|idx| projection.partition_statistics(Some(idx))) .collect::>>()?; + // Projection only includes id column, not the date partition column let expected_statistic_partition_1 = - create_partition_statistics(2, 8, 3, 4, false); + create_partition_statistics(2, 8, 3, 4, None); let expected_statistic_partition_2 = - create_partition_statistics(2, 8, 1, 2, false); + create_partition_statistics(2, 8, 1, 2, None); // Check the statistics of each partition assert_eq!(statistics.len(), 2); - assert_eq!(statistics[0], expected_statistic_partition_1); - assert_eq!(statistics[1], expected_statistic_partition_2); + assert_eq!(*statistics[0], expected_statistic_partition_1); + assert_eq!(*statistics[1], expected_statistic_partition_2); // Check the statistics_by_partition with real results let expected_stats = vec![ @@ -277,10 +316,16 @@ mod test { let statistics = (0..sort_exec.output_partitioning().partition_count()) .map(|idx| sort_exec.partition_statistics(Some(idx))) .collect::>>()?; - let expected_statistic_partition = - create_partition_statistics(4, 220, 1, 4, true); + // All 4 files merged: ids [1-4], dates [2025-03-01, 2025-03-04] + let expected_statistic_partition = create_partition_statistics( + 4, + 32, + 1, + 4, + Some((DATE_2025_03_01, DATE_2025_03_04)), + ); assert_eq!(statistics.len(), 1); - assert_eq!(statistics[0], expected_statistic_partition); + assert_eq!(*statistics[0], expected_statistic_partition); // Check the statistics_by_partition with real results let expected_stats = vec![ExpectedStatistics::NonEmpty(1, 4, 4)]; validate_statistics_with_data(sort_exec.clone(), expected_stats, 0).await?; @@ -291,16 +336,28 @@ mod test { let sort_exec: Arc = Arc::new( SortExec::new(ordering.into(), scan_2).with_preserve_partitioning(true), ); - let expected_statistic_partition_1 = - create_partition_statistics(2, 110, 3, 4, true); - let expected_statistic_partition_2 = - create_partition_statistics(2, 110, 1, 2, true); + // Partition 1: ids [3,4], dates [2025-03-01, 2025-03-02] + let expected_statistic_partition_1 = create_partition_statistics( + 2, + 16, + 3, + 4, + Some((DATE_2025_03_01, DATE_2025_03_02)), + ); + // Partition 2: ids [1,2], dates [2025-03-03, 2025-03-04] + let expected_statistic_partition_2 = create_partition_statistics( + 2, + 16, + 1, + 2, + Some((DATE_2025_03_03, DATE_2025_03_04)), + ); let statistics = (0..sort_exec.output_partitioning().partition_count()) .map(|idx| sort_exec.partition_statistics(Some(idx))) .collect::>>()?; assert_eq!(statistics.len(), 2); - assert_eq!(statistics[0], expected_statistic_partition_1); - assert_eq!(statistics[1], expected_statistic_partition_2); + assert_eq!(*statistics[0], expected_statistic_partition_1); + assert_eq!(*statistics[1], expected_statistic_partition_2); // Check the statistics_by_partition with real results let expected_stats = vec![ @@ -324,34 +381,61 @@ mod test { let filter: Arc = 
Arc::new(FilterExec::try_new(predicate, scan)?); let full_statistics = filter.partition_statistics(None)?; + // Filter preserves original total_rows and byte_size from input + // (4 total rows = 2 partitions * 2 rows each, byte_size = 4 * 4 = 16 bytes for int32) let expected_full_statistic = Statistics { num_rows: Precision::Inexact(0), total_byte_size: Precision::Inexact(0), column_statistics: vec![ ColumnStatistics { null_count: Precision::Exact(0), - max_value: Precision::Exact(ScalarValue::Null), - min_value: Precision::Exact(ScalarValue::Null), - sum_value: Precision::Exact(ScalarValue::Null), + max_value: Precision::Exact(ScalarValue::Int32(None)), + min_value: Precision::Exact(ScalarValue::Int32(None)), + sum_value: Precision::Exact(ScalarValue::Int32(None)), distinct_count: Precision::Exact(0), + byte_size: Precision::Exact(16), }, ColumnStatistics { null_count: Precision::Exact(0), - max_value: Precision::Exact(ScalarValue::Null), - min_value: Precision::Exact(ScalarValue::Null), - sum_value: Precision::Exact(ScalarValue::Null), + max_value: Precision::Exact(ScalarValue::Date32(None)), + min_value: Precision::Exact(ScalarValue::Date32(None)), + sum_value: Precision::Exact(ScalarValue::Date32(None)), distinct_count: Precision::Exact(0), + byte_size: Precision::Exact(16), // 4 rows * 4 bytes (Date32) }, ], }; - assert_eq!(full_statistics, expected_full_statistic); + assert_eq!(*full_statistics, expected_full_statistic); let statistics = (0..filter.output_partitioning().partition_count()) .map(|idx| filter.partition_statistics(Some(idx))) .collect::>>()?; assert_eq!(statistics.len(), 2); - assert_eq!(statistics[0], expected_full_statistic); - assert_eq!(statistics[1], expected_full_statistic); + // Per-partition stats: each partition has 2 rows, byte_size = 2 * 4 = 8 + let expected_partition_statistic = Statistics { + num_rows: Precision::Inexact(0), + total_byte_size: Precision::Inexact(0), + column_statistics: vec![ + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Int32(None)), + min_value: Precision::Exact(ScalarValue::Int32(None)), + sum_value: Precision::Exact(ScalarValue::Int32(None)), + distinct_count: Precision::Exact(0), + byte_size: Precision::Exact(8), + }, + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Date32(None)), + min_value: Precision::Exact(ScalarValue::Date32(None)), + sum_value: Precision::Exact(ScalarValue::Date32(None)), + distinct_count: Precision::Exact(0), + byte_size: Precision::Exact(8), // 2 rows * 4 bytes (Date32) + }, + ], + }; + assert_eq!(*statistics[0], expected_partition_statistic); + assert_eq!(*statistics[1], expected_partition_statistic); Ok(()) } @@ -365,18 +449,30 @@ mod test { .collect::>>()?; // Check that we have 4 partitions (2 from each scan) assert_eq!(statistics.len(), 4); - let expected_statistic_partition_1 = - create_partition_statistics(2, 110, 3, 4, true); - let expected_statistic_partition_2 = - create_partition_statistics(2, 110, 1, 2, true); + // Partition 1: ids [3,4], dates [2025-03-01, 2025-03-02] + let expected_statistic_partition_1 = create_partition_statistics( + 2, + 16, + 3, + 4, + Some((DATE_2025_03_01, DATE_2025_03_02)), + ); + // Partition 2: ids [1,2], dates [2025-03-03, 2025-03-04] + let expected_statistic_partition_2 = create_partition_statistics( + 2, + 16, + 1, + 2, + Some((DATE_2025_03_03, DATE_2025_03_04)), + ); // Verify first partition (from first scan) - assert_eq!(statistics[0], 
expected_statistic_partition_1); + assert_eq!(*statistics[0], expected_statistic_partition_1); // Verify second partition (from first scan) - assert_eq!(statistics[1], expected_statistic_partition_2); + assert_eq!(*statistics[1], expected_statistic_partition_2); // Verify third partition (from second scan - same as first partition) - assert_eq!(statistics[2], expected_statistic_partition_1); + assert_eq!(*statistics[2], expected_statistic_partition_1); // Verify fourth partition (from second scan - same as second partition) - assert_eq!(statistics[3], expected_statistic_partition_2); + assert_eq!(*statistics[3], expected_statistic_partition_2); // Check the statistics_by_partition with real results let expected_stats = vec![ @@ -416,16 +512,17 @@ mod test { .collect::>>()?; assert_eq!(stats.len(), 2); + // Each partition gets half of combined input, total_rows per partition = 4 let expected_stats = Statistics { num_rows: Precision::Inexact(4), - total_byte_size: Precision::Inexact(220), + total_byte_size: Precision::Inexact(32), column_statistics: vec![ ColumnStatistics::new_unknown(), ColumnStatistics::new_unknown(), ], }; - assert_eq!(stats[0], expected_stats); - assert_eq!(stats[1], expected_stats); + assert_eq!(*stats[0], expected_stats); + assert_eq!(*stats[1], expected_stats); // Verify the execution results let partitions = execute_stream_partitioned( @@ -461,30 +558,78 @@ mod test { .collect::>>()?; // Check that we have 2 partitions assert_eq!(statistics.len(), 2); - let mut expected_statistic_partition_1 = - create_partition_statistics(8, 48400, 1, 4, true); - expected_statistic_partition_1 - .column_statistics - .push(ColumnStatistics { - null_count: Precision::Exact(0), - max_value: Precision::Exact(ScalarValue::Int32(Some(4))), - min_value: Precision::Exact(ScalarValue::Int32(Some(3))), - sum_value: Precision::Absent, - distinct_count: Precision::Absent, - }); - let mut expected_statistic_partition_2 = - create_partition_statistics(8, 48400, 1, 4, true); - expected_statistic_partition_2 - .column_statistics - .push(ColumnStatistics { - null_count: Precision::Exact(0), - max_value: Precision::Exact(ScalarValue::Int32(Some(2))), - min_value: Precision::Exact(ScalarValue::Int32(Some(1))), - sum_value: Precision::Absent, - distinct_count: Precision::Absent, - }); - assert_eq!(statistics[0], expected_statistic_partition_1); - assert_eq!(statistics[1], expected_statistic_partition_2); + // Cross join output schema: [left.id, left.date, right.id] + // Cross join doesn't propagate Column's byte_size + let expected_statistic_partition_1 = Statistics { + num_rows: Precision::Exact(8), + total_byte_size: Precision::Exact(512), + column_statistics: vec![ + // column 0: left.id (Int32, file column from t1) + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Int32(Some(4))), + min_value: Precision::Exact(ScalarValue::Int32(Some(1))), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + byte_size: Precision::Absent, + }, + // column 1: left.date (Date32, partition column from t1) + // Partition column statistics are exact because all rows in a partition share the same value. 
+ ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Date32(Some(20151))), + min_value: Precision::Exact(ScalarValue::Date32(Some(20148))), + sum_value: Precision::Absent, + distinct_count: Precision::Inexact(4), + byte_size: Precision::Absent, + }, + // column 2: right.id (Int32, file column from t2) - right partition 0: ids [3,4] + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Int32(Some(4))), + min_value: Precision::Exact(ScalarValue::Int32(Some(3))), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + byte_size: Precision::Absent, + }, + ], + }; + let expected_statistic_partition_2 = Statistics { + num_rows: Precision::Exact(8), + total_byte_size: Precision::Exact(512), + column_statistics: vec![ + // column 0: left.id (Int32, file column from t1) + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Int32(Some(4))), + min_value: Precision::Exact(ScalarValue::Int32(Some(1))), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + byte_size: Precision::Absent, + }, + // column 1: left.date (Date32, partition column from t1) + // Partition column statistics are exact because all rows in a partition share the same value. + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Date32(Some(20151))), + min_value: Precision::Exact(ScalarValue::Date32(Some(20148))), + sum_value: Precision::Absent, + distinct_count: Precision::Inexact(4), + byte_size: Precision::Absent, + }, + // column 2: right.id (Int32, file column from t2) - right partition 1: ids [1,2] + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Int32(Some(2))), + min_value: Precision::Exact(ScalarValue::Int32(Some(1))), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + byte_size: Precision::Absent, + }, + ], + }; + assert_eq!(*statistics[0], expected_statistic_partition_1); + assert_eq!(*statistics[1], expected_statistic_partition_2); // Check the statistics_by_partition with real results let expected_stats = vec![ @@ -496,27 +641,77 @@ mod test { } #[tokio::test] - async fn test_statistic_by_partition_of_coalesce_batches() -> Result<()> { - let scan = create_scan_exec_with_statistics(None, Some(2)).await; - let coalesce_batches: Arc = - Arc::new(CoalesceBatchesExec::new(scan, 2)); - let expected_statistic_partition_1 = - create_partition_statistics(2, 110, 3, 4, true); - let expected_statistic_partition_2 = - create_partition_statistics(2, 110, 1, 2, true); - let statistics = (0..coalesce_batches.output_partitioning().partition_count()) - .map(|idx| coalesce_batches.partition_statistics(Some(idx))) + async fn test_statistic_by_partition_of_nested_loop_join() -> Result<()> { + use datafusion_expr::JoinType; + + let left_scan = create_scan_exec_with_statistics(None, Some(2)).await; + let left_scan_coalesced: Arc = + Arc::new(CoalescePartitionsExec::new(left_scan)); + + let right_scan = create_scan_exec_with_statistics(None, Some(2)).await; + + let nested_loop_join: Arc = + Arc::new(NestedLoopJoinExec::try_new( + left_scan_coalesced, + right_scan, + None, + &JoinType::RightSemi, + None, + )?); + + // Test partition_statistics(None) - returns overall statistics + // For RightSemi join, output columns come from right side only + let full_statistics = nested_loop_join.partition_statistics(None)?; + // With empty join columns, estimate_join_statistics 
returns Inexact row count + // based on the outer side (right side for RightSemi) + let mut expected_full_statistics = create_partition_statistics( + 4, + 32, + 1, + 4, + Some((DATE_2025_03_01, DATE_2025_03_04)), + ); + expected_full_statistics.num_rows = Precision::Inexact(4); + expected_full_statistics.total_byte_size = Precision::Absent; + assert_eq!(*full_statistics, expected_full_statistics); + + // Test partition_statistics(Some(idx)) - returns partition-specific statistics + // Partition 1: ids [3,4], dates [2025-03-01, 2025-03-02] + let mut expected_statistic_partition_1 = create_partition_statistics( + 2, + 16, + 3, + 4, + Some((DATE_2025_03_01, DATE_2025_03_02)), + ); + expected_statistic_partition_1.num_rows = Precision::Inexact(2); + expected_statistic_partition_1.total_byte_size = Precision::Absent; + + // Partition 2: ids [1,2], dates [2025-03-03, 2025-03-04] + let mut expected_statistic_partition_2 = create_partition_statistics( + 2, + 16, + 1, + 2, + Some((DATE_2025_03_03, DATE_2025_03_04)), + ); + expected_statistic_partition_2.num_rows = Precision::Inexact(2); + expected_statistic_partition_2.total_byte_size = Precision::Absent; + + let statistics = (0..nested_loop_join.output_partitioning().partition_count()) + .map(|idx| nested_loop_join.partition_statistics(Some(idx))) .collect::>>()?; assert_eq!(statistics.len(), 2); - assert_eq!(statistics[0], expected_statistic_partition_1); - assert_eq!(statistics[1], expected_statistic_partition_2); + assert_eq!(*statistics[0], expected_statistic_partition_1); + assert_eq!(*statistics[1], expected_statistic_partition_2); // Check the statistics_by_partition with real results let expected_stats = vec![ ExpectedStatistics::NonEmpty(3, 4, 2), ExpectedStatistics::NonEmpty(1, 2, 2), ]; - validate_statistics_with_data(coalesce_batches, expected_stats, 0).await?; + validate_statistics_with_data(nested_loop_join, expected_stats, 0).await?; + Ok(()) } @@ -525,13 +720,19 @@ mod test { let scan = create_scan_exec_with_statistics(None, Some(2)).await; let coalesce_partitions: Arc = Arc::new(CoalescePartitionsExec::new(scan)); - let expected_statistic_partition = - create_partition_statistics(4, 220, 1, 4, true); + // All files merged: ids [1-4], dates [2025-03-01, 2025-03-04] + let expected_statistic_partition = create_partition_statistics( + 4, + 32, + 1, + 4, + Some((DATE_2025_03_01, DATE_2025_03_04)), + ); let statistics = (0..coalesce_partitions.output_partitioning().partition_count()) .map(|idx| coalesce_partitions.partition_statistics(Some(idx))) .collect::>>()?; assert_eq!(statistics.len(), 1); - assert_eq!(statistics[0], expected_statistic_partition); + assert_eq!(*statistics[0], expected_statistic_partition); // Check the statistics_by_partition with real results let expected_stats = vec![ExpectedStatistics::NonEmpty(1, 4, 4)]; @@ -548,20 +749,20 @@ mod test { .map(|idx| local_limit.partition_statistics(Some(idx))) .collect::>>()?; assert_eq!(statistics.len(), 2); - let mut expected_0 = statistics[0].clone(); + let mut expected_0 = Statistics::clone(&statistics[0]); expected_0.column_statistics = expected_0 .column_statistics .into_iter() .map(|c| c.to_inexact()) .collect(); - let mut expected_1 = statistics[1].clone(); + let mut expected_1 = Statistics::clone(&statistics[1]); expected_1.column_statistics = expected_1 .column_statistics .into_iter() .map(|c| c.to_inexact()) .collect(); - assert_eq!(statistics[0], expected_0); - assert_eq!(statistics[1], expected_1); + assert_eq!(*statistics[0], expected_0); + 
assert_eq!(*statistics[1], expected_1); Ok(()) } @@ -575,9 +776,15 @@ mod test { .map(|idx| global_limit.partition_statistics(Some(idx))) .collect::>>()?; assert_eq!(statistics.len(), 1); - let expected_statistic_partition = - create_partition_statistics(2, 110, 3, 4, true); - assert_eq!(statistics[0], expected_statistic_partition); + // GlobalLimit takes from first partition: ids [3,4], dates [2025-03-01, 2025-03-02] + let expected_statistic_partition = create_partition_statistics( + 2, + 16, + 3, + 4, + Some((DATE_2025_03_01, DATE_2025_03_02)), + ); + assert_eq!(*statistics[0], expected_statistic_partition); Ok(()) } @@ -601,11 +808,13 @@ mod test { ), ]); - let aggr_expr = vec![AggregateExprBuilder::new(count_udaf(), vec![lit(1)]) - .schema(Arc::clone(&scan_schema)) - .alias(String::from("COUNT(c)")) - .build() - .map(Arc::new)?]; + let aggr_expr = vec![ + AggregateExprBuilder::new(count_udaf(), vec![lit(1)]) + .schema(Arc::clone(&scan_schema)) + .alias(String::from("COUNT(c)")) + .build() + .map(Arc::new)?, + ]; let aggregate_exec_partial: Arc = Arc::new(AggregateExec::try_new( @@ -625,9 +834,10 @@ mod test { let p0_statistics = aggregate_exec_partial.partition_statistics(Some(0))?; + // Aggregate doesn't propagate num_rows and ColumnStatistics byte_size from input let expected_p0_statistics = Statistics { num_rows: Precision::Inexact(2), - total_byte_size: Precision::Absent, + total_byte_size: Precision::Inexact(16), column_statistics: vec![ ColumnStatistics { null_count: Precision::Absent, @@ -635,17 +845,18 @@ mod test { min_value: Precision::Exact(ScalarValue::Int32(Some(3))), sum_value: Precision::Absent, distinct_count: Precision::Absent, + byte_size: Precision::Absent, }, ColumnStatistics::new_unknown(), ColumnStatistics::new_unknown(), ], }; - assert_eq!(&p0_statistics, &expected_p0_statistics); + assert_eq!(*p0_statistics, expected_p0_statistics); let expected_p1_statistics = Statistics { num_rows: Precision::Inexact(2), - total_byte_size: Precision::Absent, + total_byte_size: Precision::Inexact(16), column_statistics: vec![ ColumnStatistics { null_count: Precision::Absent, @@ -653,6 +864,7 @@ mod test { min_value: Precision::Exact(ScalarValue::Int32(Some(1))), sum_value: Precision::Absent, distinct_count: Precision::Absent, + byte_size: Precision::Absent, }, ColumnStatistics::new_unknown(), ColumnStatistics::new_unknown(), @@ -660,7 +872,7 @@ mod test { }; let p1_statistics = aggregate_exec_partial.partition_statistics(Some(1))?; - assert_eq!(&p1_statistics, &expected_p1_statistics); + assert_eq!(*p1_statistics, expected_p1_statistics); validate_statistics_with_data( aggregate_exec_partial.clone(), @@ -682,10 +894,10 @@ mod test { )?); let p0_statistics = agg_final.partition_statistics(Some(0))?; - assert_eq!(&p0_statistics, &expected_p0_statistics); + assert_eq!(*p0_statistics, expected_p0_statistics); let p1_statistics = agg_final.partition_statistics(Some(1))?; - assert_eq!(&p1_statistics, &expected_p1_statistics); + assert_eq!(*p1_statistics, expected_p1_statistics); validate_statistics_with_data( agg_final.clone(), @@ -720,14 +932,17 @@ mod test { num_rows: Precision::Exact(0), total_byte_size: Precision::Absent, column_statistics: vec![ - ColumnStatistics::new_unknown(), + ColumnStatistics { + distinct_count: Precision::Exact(0), + ..ColumnStatistics::new_unknown() + }, ColumnStatistics::new_unknown(), ColumnStatistics::new_unknown(), ], }; - assert_eq!(&empty_stat, &agg_partial.partition_statistics(Some(0))?); - assert_eq!(&empty_stat, 
&agg_partial.partition_statistics(Some(1))?); + assert_eq!(empty_stat, *agg_partial.partition_statistics(Some(0))?); + assert_eq!(empty_stat, *agg_partial.partition_statistics(Some(1))?); validate_statistics_with_data( agg_partial.clone(), vec![ExpectedStatistics::Empty, ExpectedStatistics::Empty], @@ -753,8 +968,8 @@ mod test { agg_partial.schema(), )?); - assert_eq!(&empty_stat, &agg_final.partition_statistics(Some(0))?); - assert_eq!(&empty_stat, &agg_final.partition_statistics(Some(1))?); + assert_eq!(empty_stat, *agg_final.partition_statistics(Some(0))?); + assert_eq!(empty_stat, *agg_final.partition_statistics(Some(1))?); validate_statistics_with_data( agg_final, @@ -790,7 +1005,7 @@ mod test { column_statistics: vec![ColumnStatistics::new_unknown()], }; - assert_eq!(&expect_stat, &agg_final.partition_statistics(Some(0))?); + assert_eq!(expect_stat, *agg_final.partition_statistics(Some(0))?); // Verify that the aggregate final result has exactly one partition with one row let mut partitions = execute_stream_partitioned( @@ -824,13 +1039,13 @@ mod test { &schema, None, ); - assert_eq!(actual, expected); + assert_eq!(*actual, expected); all_batches.push(batches); } let actual = plan.partition_statistics(None)?; let expected = compute_record_batch_statistics(&all_batches, &schema, None); - assert_eq!(actual, expected); + assert_eq!(*actual, expected); Ok(()) } @@ -849,9 +1064,10 @@ mod test { .collect::>>()?; assert_eq!(statistics.len(), 3); + // Repartition preserves original total_rows from input (4 rows total) let expected_stats = Statistics { num_rows: Precision::Inexact(1), - total_byte_size: Precision::Inexact(73), + total_byte_size: Precision::Inexact(10), column_statistics: vec![ ColumnStatistics::new_unknown(), ColumnStatistics::new_unknown(), @@ -860,7 +1076,7 @@ mod test { // All partitions should have the same statistics for stat in statistics.iter() { - assert_eq!(stat, &expected_stats); + assert_eq!(**stat, expected_stats); } // Verify that the result has exactly 3 partitions @@ -878,9 +1094,9 @@ mod test { partition_row_counts.push(total_rows); } assert_eq!(partition_row_counts.len(), 3); - assert_eq!(partition_row_counts[0], 2); + assert_eq!(partition_row_counts[0], 1); assert_eq!(partition_row_counts[1], 2); - assert_eq!(partition_row_counts[2], 0); + assert_eq!(partition_row_counts[2], 1); Ok(()) } @@ -898,9 +1114,11 @@ mod test { let result = repartition.partition_statistics(Some(2)); assert!(result.is_err()); let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("RepartitionExec invalid partition 2 (expected less than 2)")); + assert!( + error + .to_string() + .contains("RepartitionExec invalid partition 2 (expected less than 2)") + ); let partitions = execute_stream_partitioned( repartition.clone(), @@ -923,7 +1141,7 @@ mod test { )?); let result = repartition.partition_statistics(Some(0))?; - assert_eq!(result, Statistics::new_unknown(&scan_schema)); + assert_eq!(*result, Statistics::new_unknown(&scan_schema)); // Verify that the result has exactly 0 partitions let partitions = execute_stream_partitioned( @@ -953,16 +1171,17 @@ mod test { .collect::>>()?; assert_eq!(stats.len(), 2); + // Repartition preserves original total_rows from input (4 rows total) let expected_stats = Statistics { num_rows: Precision::Inexact(2), - total_byte_size: Precision::Inexact(110), + total_byte_size: Precision::Inexact(16), column_statistics: vec![ ColumnStatistics::new_unknown(), ColumnStatistics::new_unknown(), ], }; - assert_eq!(stats[0], expected_stats); - 
assert_eq!(stats[1], expected_stats); + assert_eq!(*stats[0], expected_stats); + assert_eq!(*stats[1], expected_stats); // Verify the repartition execution results let partitions = @@ -980,4 +1199,412 @@ mod test { Ok(()) } + + #[tokio::test] + async fn test_statistic_by_partition_of_window_agg() -> Result<()> { + let scan = create_scan_exec_with_statistics(None, Some(2)).await; + + let window_expr = create_window_expr( + &WindowFunctionDefinition::AggregateUDF(count_udaf()), + "count".to_owned(), + &[col("id", &scan.schema())?], + &[], // no partition by + &[PhysicalSortExpr::new( + col("id", &scan.schema())?, + SortOptions::default(), + )], + Arc::new(WindowFrame::new(Some(false))), + scan.schema(), + false, + false, + None, + )?; + + let window_agg: Arc = + Arc::new(WindowAggExec::try_new(vec![window_expr], scan, true)?); + + // Verify partition statistics are properly propagated (not unknown) + let statistics = (0..window_agg.output_partitioning().partition_count()) + .map(|idx| window_agg.partition_statistics(Some(idx))) + .collect::>>()?; + + assert_eq!(statistics.len(), 2); + + // Window functions preserve input row counts and column statistics + // but add unknown statistics for the new window column + let expected_statistic_partition_1 = Statistics { + num_rows: Precision::Exact(2), + total_byte_size: Precision::Absent, + column_statistics: vec![ + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Int32(Some(4))), + min_value: Precision::Exact(ScalarValue::Int32(Some(3))), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + byte_size: Precision::Exact(8), + }, + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Date32(Some( + DATE_2025_03_02, + ))), + min_value: Precision::Exact(ScalarValue::Date32(Some( + DATE_2025_03_01, + ))), + sum_value: Precision::Absent, + distinct_count: Precision::Inexact(2), + byte_size: Precision::Exact(8), + }, + ColumnStatistics::new_unknown(), // window column + ], + }; + + let expected_statistic_partition_2 = Statistics { + num_rows: Precision::Exact(2), + total_byte_size: Precision::Absent, + column_statistics: vec![ + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Int32(Some(2))), + min_value: Precision::Exact(ScalarValue::Int32(Some(1))), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + byte_size: Precision::Exact(8), + }, + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Date32(Some( + DATE_2025_03_04, + ))), + min_value: Precision::Exact(ScalarValue::Date32(Some( + DATE_2025_03_03, + ))), + sum_value: Precision::Absent, + distinct_count: Precision::Inexact(2), + byte_size: Precision::Exact(8), + }, + ColumnStatistics::new_unknown(), // window column + ], + }; + + assert_eq!(*statistics[0], expected_statistic_partition_1); + assert_eq!(*statistics[1], expected_statistic_partition_2); + + // Verify the statistics match actual execution results + let expected_stats = vec![ + ExpectedStatistics::NonEmpty(3, 4, 2), + ExpectedStatistics::NonEmpty(1, 2, 2), + ]; + validate_statistics_with_data(window_agg, expected_stats, 0).await?; + + Ok(()) + } + + #[tokio::test] + async fn test_statistics_by_partition_of_empty_exec() -> Result<()> { + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, true), + ])); + + // Try to test with single partition + let 
empty_single = Arc::new(EmptyExec::new(Arc::clone(&schema)));
+
+        let stats = empty_single.partition_statistics(Some(0))?;
+        assert_eq!(stats.num_rows, Precision::Exact(0));
+        assert_eq!(stats.total_byte_size, Precision::Exact(0));
+        assert_eq!(stats.column_statistics.len(), 2);
+
+        for col_stat in &stats.column_statistics {
+            assert_eq!(col_stat.null_count, Precision::Exact(0));
+            assert_eq!(col_stat.distinct_count, Precision::Exact(0));
+            assert_eq!(col_stat.byte_size, Precision::Exact(0));
+            assert_eq!(col_stat.min_value, Precision::<ScalarValue>::Absent);
+            assert_eq!(col_stat.max_value, Precision::<ScalarValue>::Absent);
+            assert_eq!(col_stat.sum_value, Precision::<ScalarValue>::Absent);
+        }
+
+        let overall_stats = empty_single.partition_statistics(None)?;
+        assert_eq!(stats, overall_stats);
+
+        validate_statistics_with_data(empty_single, vec![ExpectedStatistics::Empty], 0)
+            .await?;
+
+        // Test with multiple partitions
+        let empty_multi: Arc<dyn ExecutionPlan> =
+            Arc::new(EmptyExec::new(Arc::clone(&schema)).with_partitions(3));
+
+        let statistics = (0..empty_multi.output_partitioning().partition_count())
+            .map(|idx| empty_multi.partition_statistics(Some(idx)))
+            .collect::<Result<Vec<_>>>()?;
+
+        assert_eq!(statistics.len(), 3);
+
+        for stat in &statistics {
+            assert_eq!(stat.num_rows, Precision::Exact(0));
+            assert_eq!(stat.total_byte_size, Precision::Exact(0));
+        }
+
+        validate_statistics_with_data(
+            empty_multi,
+            vec![
+                ExpectedStatistics::Empty,
+                ExpectedStatistics::Empty,
+                ExpectedStatistics::Empty,
+            ],
+            0,
+        )
+        .await?;
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_hash_join_partition_statistics() -> Result<()> {
+        // Create left table scan and coalesce to 1 partition for CollectLeft mode
+        let left_scan = create_scan_exec_with_statistics(None, Some(2)).await;
+        let left_scan_coalesced = Arc::new(CoalescePartitionsExec::new(left_scan.clone()))
+            as Arc<dyn ExecutionPlan>;
+
+        // Create right table scan with a different table name
+        let right_create_table_sql = "CREATE EXTERNAL TABLE t2 (id INT NOT NULL, date DATE) \
+         STORED AS PARQUET LOCATION './tests/data/test_statistics_per_partition'\
+         PARTITIONED BY (date) \
+         WITH ORDER (id ASC);";
+        let right_scan =
+            create_scan_exec_with_statistics(Some(right_create_table_sql), Some(2)).await;
+
+        // Create join condition: t1.id = t2.id
+        let on = vec![(
+            Arc::new(Column::new("id", 0)) as Arc<dyn PhysicalExpr>,
+            Arc::new(Column::new("id", 0)) as Arc<dyn PhysicalExpr>,
+        )];
+
+        // Test CollectLeft mode - the left child must have exactly 1 partition
+        let collect_left_join = Arc::new(HashJoinExec::try_new(
+            left_scan_coalesced,
+            Arc::clone(&right_scan),
+            on.clone(),
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::CollectLeft,
+            NullEquality::NullEqualsNothing,
+            false,
+        )?) as Arc<dyn ExecutionPlan>;
+
+        // Test partition statistics for CollectLeft mode
+        let statistics = (0..collect_left_join.output_partitioning().partition_count())
+            .map(|idx| collect_left_join.partition_statistics(Some(idx)))
+            .collect::<Result<Vec<_>>>()?;
+
+        // Check that we have the expected number of partitions
+        assert_eq!(statistics.len(), 2);
+
+        // For CollectLeft mode, the min/max values come from the entire left table
+        // and from the specific partition of the right table.
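+        // A minimal sketch of that merge (illustrative only; the fixture
+        // guarantees partition 0 holds ids [3, 4] and partition 1 holds
+        // ids [1, 2], so the coalesced left side spans both):
+        let (left_min, left_max) = (i32::min(3, 1), i32::max(4, 2));
+        debug_assert_eq!((left_min, left_max), (1, 4));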
+ let expected_p0_statistics = Statistics { + num_rows: Precision::Inexact(2), + total_byte_size: Precision::Absent, + column_statistics: vec![ + // Left id column: all partitions (id 1..4) + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Int32(Some(4))), + min_value: Precision::Exact(ScalarValue::Int32(Some(1))), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + byte_size: Precision::Exact(16), + }, + // Left date column: all partitions (2025-03-01..2025-03-04) + // NDV is Inexact(4) derived from the date range via interval analysis + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Date32(Some( + DATE_2025_03_04, + ))), + min_value: Precision::Exact(ScalarValue::Date32(Some( + DATE_2025_03_01, + ))), + sum_value: Precision::Absent, + distinct_count: Precision::Inexact(4), + byte_size: Precision::Exact(16), + }, + // Right id column: partition 0 only (id 3..4) + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Int32(Some(4))), + min_value: Precision::Exact(ScalarValue::Int32(Some(3))), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + byte_size: Precision::Exact(8), + }, + // Right date column: partition 0 only (2025-03-01..2025-03-02) + // NDV is Inexact(2) derived from the date range via interval analysis + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Date32(Some( + DATE_2025_03_02, + ))), + min_value: Precision::Exact(ScalarValue::Date32(Some( + DATE_2025_03_01, + ))), + sum_value: Precision::Absent, + distinct_count: Precision::Inexact(2), + byte_size: Precision::Exact(8), + }, + ], + }; + assert_eq!(*statistics[0], expected_p0_statistics); + + // Test Partitioned mode + let partitioned_join = Arc::new(HashJoinExec::try_new( + Arc::clone(&left_scan), + Arc::clone(&right_scan), + on.clone(), + None, + &JoinType::Inner, + None, + PartitionMode::Partitioned, + NullEquality::NullEqualsNothing, + false, + )?) as Arc; + + // Test partition statistics for Partitioned mode + let statistics = (0..partitioned_join.output_partitioning().partition_count()) + .map(|idx| partitioned_join.partition_statistics(Some(idx))) + .collect::>>()?; + + // Check that we have the expected number of partitions + assert_eq!(statistics.len(), 2); + + // For partitioned mode, the min/max values are from the specific partition for each side. 
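+        // Illustrative arithmetic for the expectation below: partition 0 of
+        // each side holds 2 of the 4 total rows, so the paired inputs yield
+        // an Inexact(2) row estimate for this output partition:
+        let rows_per_partition = 4usize / 2;
+        debug_assert_eq!(rows_per_partition, 2);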
+ let expected_p0_statistics = Statistics { + num_rows: Precision::Inexact(2), + total_byte_size: Precision::Absent, + column_statistics: vec![ + // Left id column: partition 0 only (id 3..4) + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Int32(Some(4))), + min_value: Precision::Exact(ScalarValue::Int32(Some(3))), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + byte_size: Precision::Exact(8), + }, + // Left date column: partition 0 only (2025-03-01..2025-03-02) + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Date32(Some( + DATE_2025_03_02, + ))), + min_value: Precision::Exact(ScalarValue::Date32(Some( + DATE_2025_03_01, + ))), + sum_value: Precision::Absent, + distinct_count: Precision::Inexact(2), + byte_size: Precision::Exact(8), + }, + // Right id column: partition 0 only (id 3..4) + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Int32(Some(4))), + min_value: Precision::Exact(ScalarValue::Int32(Some(3))), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + byte_size: Precision::Exact(8), + }, + // Right date column: partition 0 only (2025-03-01..2025-03-02) + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Date32(Some( + DATE_2025_03_02, + ))), + min_value: Precision::Exact(ScalarValue::Date32(Some( + DATE_2025_03_01, + ))), + sum_value: Precision::Absent, + distinct_count: Precision::Inexact(2), + byte_size: Precision::Exact(8), + }, + ], + }; + assert_eq!(*statistics[0], expected_p0_statistics); + + // Test Auto mode - should fall back to getting all partition statistics + let auto_join = Arc::new(HashJoinExec::try_new( + Arc::clone(&left_scan), + Arc::clone(&right_scan), + on, + None, + &JoinType::Inner, + None, + PartitionMode::Auto, + NullEquality::NullEqualsNothing, + false, + )?) as Arc; + + // Test partition statistics for Auto mode + let statistics = (0..auto_join.output_partitioning().partition_count()) + .map(|idx| auto_join.partition_statistics(Some(idx))) + .collect::>>()?; + + // Check that we have the expected number of partitions + assert_eq!(statistics.len(), 2); + + // For auto mode, the min/max values are from the entire left and right tables. 
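+        // Sketch of the interval-based NDV estimate expected below: for a
+        // Date32 column the distinct count is derived from the closed day
+        // range, so the full 2025-03-01..=2025-03-04 span yields Inexact(4):
+        let ndv_estimate = (DATE_2025_03_04 - DATE_2025_03_01 + 1) as usize;
+        debug_assert_eq!(ndv_estimate, 4);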
+ let expected_p0_statistics = Statistics { + num_rows: Precision::Inexact(4), + total_byte_size: Precision::Absent, + column_statistics: vec![ + // Left id column: all partitions (id 1..4) + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Int32(Some(4))), + min_value: Precision::Exact(ScalarValue::Int32(Some(1))), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + byte_size: Precision::Exact(16), + }, + // Left date column: all partitions (2025-03-01..2025-03-04) + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Date32(Some( + DATE_2025_03_04, + ))), + min_value: Precision::Exact(ScalarValue::Date32(Some( + DATE_2025_03_01, + ))), + sum_value: Precision::Absent, + distinct_count: Precision::Inexact(4), + byte_size: Precision::Exact(16), + }, + // Right id column: all partitions (id 1..4) + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Int32(Some(4))), + min_value: Precision::Exact(ScalarValue::Int32(Some(1))), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + byte_size: Precision::Exact(16), + }, + // Right date column: all partitions (2025-03-01..2025-03-04) + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Date32(Some( + DATE_2025_03_04, + ))), + min_value: Precision::Exact(ScalarValue::Date32(Some( + DATE_2025_03_01, + ))), + sum_value: Precision::Absent, + distinct_count: Precision::Inexact(4), + byte_size: Precision::Exact(16), + }, + ], + }; + assert_eq!(*statistics[0], expected_p0_statistics); + Ok(()) + } } diff --git a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs index 8631613c3925e..6635220cf2028 100644 --- a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. 
-use std::any::Any; use std::sync::Arc; use arrow::compute::SortOptions; @@ -24,8 +23,9 @@ use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::physical_plan::CsvSource; use datafusion::datasource::source::DataSourceExec; -use datafusion_common::config::ConfigOptions; +use datafusion_common::config::{ConfigOptions, CsvOptions}; use datafusion_common::{JoinSide, JoinType, NullEquality, Result, ScalarValue}; +use datafusion_datasource::TableSchema; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; @@ -34,30 +34,31 @@ use datafusion_expr::{ }; use datafusion_expr_common::columnar_value::ColumnarValue; use datafusion_physical_expr::expressions::{ - binary, cast, col, BinaryExpr, CaseExpr, CastExpr, Column, Literal, NegativeExpr, + BinaryExpr, CaseExpr, CastExpr, Column, Literal, NegativeExpr, binary, cast, col, }; use datafusion_physical_expr::{Distribution, Partitioning, ScalarFunctionExpr}; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::{ OrderingRequirements, PhysicalSortExpr, PhysicalSortRequirement, }; +use datafusion_physical_optimizer::PhysicalOptimizerRule; use datafusion_physical_optimizer::output_requirements::OutputRequirementExec; use datafusion_physical_optimizer::projection_pushdown::ProjectionPushdown; -use datafusion_physical_optimizer::PhysicalOptimizerRule; use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec; -use datafusion_physical_plan::filter::FilterExec; +use datafusion_physical_plan::coop::CooperativeExec; +use datafusion_physical_plan::filter::{FilterExec, FilterExecBuilder}; use datafusion_physical_plan::joins::utils::{ColumnIndex, JoinFilter}; use datafusion_physical_plan::joins::{ HashJoinExec, NestedLoopJoinExec, PartitionMode, StreamJoinPartitionMode, SymmetricHashJoinExec, }; -use datafusion_physical_plan::projection::{update_expr, ProjectionExec, ProjectionExpr}; +use datafusion_physical_plan::projection::{ProjectionExec, ProjectionExpr, update_expr}; use datafusion_physical_plan::repartition::RepartitionExec; use datafusion_physical_plan::sorts::sort::SortExec; use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use datafusion_physical_plan::streaming::{PartitionStream, StreamingTableExec}; use datafusion_physical_plan::union::UnionExec; -use datafusion_physical_plan::{displayable, ExecutionPlan}; +use datafusion_physical_plan::{ExecutionPlan, displayable}; use insta::assert_snapshot; use itertools::Itertools; @@ -77,10 +78,6 @@ impl DummyUDF { } impl ScalarUDFImpl for DummyUDF { - fn as_any(&self) -> &dyn Any { - self - } - fn name(&self) -> &str { "dummy_udf" } @@ -229,9 +226,11 @@ fn test_update_matching_exprs() -> Result<()> { .map(|(expr, alias)| ProjectionExpr::new(expr.clone(), alias.clone())) .collect(); for (expr, expected_expr) in exprs.into_iter().zip(expected_exprs.into_iter()) { - assert!(update_expr(&expr, &child_exprs, true)? - .unwrap() - .eq(&expected_expr)); + assert!( + update_expr(&expr, &child_exprs, true)? 
+ .unwrap() + .eq(&expected_expr) + ); } Ok(()) @@ -368,9 +367,11 @@ fn test_update_projected_exprs() -> Result<()> { .map(|(expr, alias)| ProjectionExpr::new(expr.clone(), alias.clone())) .collect(); for (expr, expected_expr) in exprs.into_iter().zip(expected_exprs.into_iter()) { - assert!(update_expr(&expr, &proj_exprs, false)? - .unwrap() - .eq(&expected_expr)); + assert!( + update_expr(&expr, &proj_exprs, false)? + .unwrap() + .eq(&expected_expr) + ); } Ok(()) @@ -384,14 +385,20 @@ fn create_simple_csv_exec() -> Arc { Field::new("d", DataType::Int32, true), Field::new("e", DataType::Int32, true), ])); - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema, - Arc::new(CsvSource::new(false, 0, 0)), - ) - .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_projection_indices(Some(vec![0, 1, 2, 3, 4])) - .build(); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), { + let options = CsvOptions { + has_header: Some(false), + delimiter: 0, + quote: 0, + ..Default::default() + }; + Arc::new(CsvSource::new(schema.clone()).with_csv_options(options)) + }) + .with_file(PartitionedFile::new("x", 100)) + .with_projection_indices(Some(vec![0, 1, 2, 3, 4])) + .unwrap() + .build(); DataSourceExec::from_data_source(config) } @@ -403,14 +410,20 @@ fn create_projecting_csv_exec() -> Arc { Field::new("c", DataType::Int32, true), Field::new("d", DataType::Int32, true), ])); - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema, - Arc::new(CsvSource::new(false, 0, 0)), - ) - .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_projection_indices(Some(vec![3, 2, 1])) - .build(); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), { + let options = CsvOptions { + has_header: Some(false), + delimiter: 0, + quote: 0, + ..Default::default() + }; + Arc::new(CsvSource::new(schema.clone()).with_csv_options(options)) + }) + .with_file(PartitionedFile::new("x", 100)) + .with_projection_indices(Some(vec![3, 2, 1])) + .unwrap() + .build(); DataSourceExec::from_data_source(config) } @@ -432,8 +445,8 @@ fn test_csv_after_projection() -> Result<()> { let csv = create_projecting_csv_exec(); let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ - ProjectionExpr::new(Arc::new(Column::new("b", 2)), "b".to_string()), - ProjectionExpr::new(Arc::new(Column::new("d", 0)), "d".to_string()), + ProjectionExpr::new(Arc::new(Column::new("b", 2)), "b"), + ProjectionExpr::new(Arc::new(Column::new("d", 0)), "d"), ], csv.clone(), )?); @@ -469,9 +482,9 @@ fn test_memory_after_projection() -> Result<()> { let memory = create_projecting_memory_exec(); let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ - ProjectionExpr::new(Arc::new(Column::new("d", 2)), "d".to_string()), - ProjectionExpr::new(Arc::new(Column::new("e", 3)), "e".to_string()), - ProjectionExpr::new(Arc::new(Column::new("a", 1)), "a".to_string()), + ProjectionExpr::new(Arc::new(Column::new("d", 2)), "d"), + ProjectionExpr::new(Arc::new(Column::new("e", 3)), "e"), + ProjectionExpr::new(Arc::new(Column::new("a", 1)), "a"), ], memory.clone(), )?); @@ -502,11 +515,9 @@ fn test_memory_after_projection() -> Result<()> { assert_eq!( after_optimize .clone() - .as_any() .downcast_ref::() .unwrap() .data_source() - .as_any() .downcast_ref::() .unwrap() .projection() @@ -575,9 +586,9 @@ fn test_streaming_table_after_projection() -> Result<()> { )?; let projection = 
Arc::new(ProjectionExec::try_new( vec![ - ProjectionExpr::new(Arc::new(Column::new("d", 3)), "d".to_string()), - ProjectionExpr::new(Arc::new(Column::new("e", 2)), "e".to_string()), - ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a".to_string()), + ProjectionExpr::new(Arc::new(Column::new("d", 3)), "d"), + ProjectionExpr::new(Arc::new(Column::new("e", 2)), "e"), + ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a"), ], Arc::new(streaming_table) as _, )?) as _; @@ -585,10 +596,7 @@ fn test_streaming_table_after_projection() -> Result<()> { let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let result = after_optimize - .as_any() - .downcast_ref::() - .unwrap(); + let result = after_optimize.downcast_ref::().unwrap(); assert_eq!( result.partition_schema(), &Arc::new(Schema::new(vec![ @@ -642,28 +650,25 @@ fn test_projection_after_projection() -> Result<()> { let csv = create_simple_csv_exec(); let child_projection: Arc = Arc::new(ProjectionExec::try_new( vec![ - ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c".to_string()), - ProjectionExpr::new(Arc::new(Column::new("e", 4)), "new_e".to_string()), - ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a".to_string()), - ProjectionExpr::new(Arc::new(Column::new("b", 1)), "new_b".to_string()), + ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c"), + ProjectionExpr::new(Arc::new(Column::new("e", 4)), "new_e"), + ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a"), + ProjectionExpr::new(Arc::new(Column::new("b", 1)), "new_b"), ], csv.clone(), )?); let top_projection: Arc = Arc::new(ProjectionExec::try_new( vec![ - ProjectionExpr::new(Arc::new(Column::new("new_b", 3)), "new_b".to_string()), + ProjectionExpr::new(Arc::new(Column::new("new_b", 3)), "new_b"), ProjectionExpr::new( Arc::new(BinaryExpr::new( Arc::new(Column::new("c", 0)), Operator::Plus, Arc::new(Column::new("new_e", 1)), )), - "binary".to_string(), - ), - ProjectionExpr::new( - Arc::new(Column::new("new_b", 3)), - "newest_b".to_string(), + "binary", ), + ProjectionExpr::new(Arc::new(Column::new("new_b", 3)), "newest_b"), ], child_projection.clone(), )?); @@ -692,10 +697,7 @@ fn test_projection_after_projection() -> Result<()> { assert_snapshot!( actual, - @r" - ProjectionExec: expr=[b@1 as new_b, c@2 + e@4 as binary, b@1 as newest_b] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false - " + @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[b@1 as new_b, c@2 + e@4 as binary, b@1 as newest_b], file_type=csv, has_header=false" ); Ok(()) @@ -731,9 +733,9 @@ fn test_output_req_after_projection() -> Result<()> { )); let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ - ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c".to_string()), - ProjectionExpr::new(Arc::new(Column::new("a", 0)), "new_a".to_string()), - ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b".to_string()), + ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c"), + ProjectionExpr::new(Arc::new(Column::new("a", 0)), "new_a"), + ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b"), ], sort_req.clone(), )?); @@ -762,8 +764,7 @@ fn test_output_req_after_projection() -> Result<()> { actual, @r" OutputRequirementExec: order_by=[(b@2, asc), (c@0 + new_a@1, asc)], dist_by=HashPartitioned[[new_a@1, b@2]]) - ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + 
DataSourceExec: file_groups={1 group: [[x]]}, projection=[c, a@0 as new_a, b], file_type=csv, has_header=false " ); @@ -786,7 +787,6 @@ fn test_output_req_after_projection() -> Result<()> { ); assert_eq!( after_optimize - .as_any() .downcast_ref::() .unwrap() .required_input_ordering()[0] @@ -799,16 +799,16 @@ fn test_output_req_after_projection() -> Result<()> { Arc::new(Column::new("b", 2)), ]; if let Distribution::HashPartitioned(vec) = after_optimize - .as_any() .downcast_ref::() .unwrap() .required_input_distribution()[0] .clone() { - assert!(vec - .iter() - .zip(expected_distribution) - .all(|(actual, expected)| actual.eq(&expected))); + assert!( + vec.iter() + .zip(expected_distribution) + .all(|(actual, expected)| actual.eq(&expected)) + ); } else { panic!("Expected HashPartitioned distribution!"); }; @@ -823,9 +823,9 @@ fn test_coalesce_partitions_after_projection() -> Result<()> { Arc::new(CoalescePartitionsExec::new(csv)); let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ - ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b".to_string()), - ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a_new".to_string()), - ProjectionExpr::new(Arc::new(Column::new("d", 3)), "d".to_string()), + ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b"), + ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a_new"), + ProjectionExpr::new(Arc::new(Column::new("d", 3)), "d"), ], coalesce_partitions, )?); @@ -853,8 +853,7 @@ fn test_coalesce_partitions_after_projection() -> Result<()> { actual, @r" CoalescePartitionsExec - ProjectionExec: expr=[b@1 as b, a@0 as a_new, d@3 as d] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[b, a@0 as a_new, d], file_type=csv, has_header=false " ); @@ -880,9 +879,9 @@ fn test_filter_after_projection() -> Result<()> { let filter = Arc::new(FilterExec::try_new(predicate, csv)?); let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ - ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a_new".to_string()), - ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b".to_string()), - ProjectionExpr::new(Arc::new(Column::new("d", 3)), "d".to_string()), + ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a_new"), + ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b"), + ProjectionExpr::new(Arc::new(Column::new("d", 3)), "d"), ], filter.clone(), )?) 
as _; @@ -911,8 +910,7 @@ fn test_filter_after_projection() -> Result<()> { actual, @r" FilterExec: b@1 - a_new@0 > d@2 - a_new@0 - ProjectionExec: expr=[a@0 as a_new, b@1 as b, d@3 as d] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a@0 as a_new, b, d], file_type=csv, has_header=false " ); @@ -975,17 +973,11 @@ fn test_join_after_projection() -> Result<()> { )?); let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ - ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c_from_left".to_string()), - ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b_from_left".to_string()), - ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a_from_left".to_string()), - ProjectionExpr::new( - Arc::new(Column::new("a", 5)), - "a_from_right".to_string(), - ), - ProjectionExpr::new( - Arc::new(Column::new("c", 7)), - "c_from_right".to_string(), - ), + ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c_from_left"), + ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b_from_left"), + ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a_from_left"), + ProjectionExpr::new(Arc::new(Column::new("a", 5)), "a_from_right"), + ProjectionExpr::new(Arc::new(Column::new("c", 7)), "c_from_right"), ], join, )?) as _; @@ -1014,10 +1006,8 @@ fn test_join_after_projection() -> Result<()> { actual, @r" SymmetricHashJoinExec: mode=SinglePartition, join_type=Inner, on=[(b_from_left@1, c_from_right@1)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2 - ProjectionExec: expr=[c@2 as c_from_left, b@1 as b_from_left, a@0 as a_from_left] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false - ProjectionExec: expr=[a@0 as a_from_right, c@2 as c_from_right] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[c@2 as c_from_left, b@1 as b_from_left, a@0 as a_from_left], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a@0 as a_from_right, c@2 as c_from_right], file_type=csv, has_header=false " ); @@ -1039,7 +1029,6 @@ fn test_join_after_projection() -> Result<()> { assert_eq!( expected_filter_col_ind, after_optimize - .as_any() .downcast_ref::() .unwrap() .filter() @@ -1106,16 +1095,16 @@ fn test_join_after_required_projection() -> Result<()> { )?); let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ - ProjectionExpr::new(Arc::new(Column::new("a", 5)), "a".to_string()), - ProjectionExpr::new(Arc::new(Column::new("b", 6)), "b".to_string()), - ProjectionExpr::new(Arc::new(Column::new("c", 7)), "c".to_string()), - ProjectionExpr::new(Arc::new(Column::new("d", 8)), "d".to_string()), - ProjectionExpr::new(Arc::new(Column::new("e", 9)), "e".to_string()), - ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a".to_string()), - ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b".to_string()), - ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c".to_string()), - ProjectionExpr::new(Arc::new(Column::new("d", 3)), "d".to_string()), - ProjectionExpr::new(Arc::new(Column::new("e", 4)), "e".to_string()), + ProjectionExpr::new(Arc::new(Column::new("a", 5)), "a"), + ProjectionExpr::new(Arc::new(Column::new("b", 6)), "b"), + ProjectionExpr::new(Arc::new(Column::new("c", 7)), "c"), + ProjectionExpr::new(Arc::new(Column::new("d", 8)), "d"), + 
ProjectionExpr::new(Arc::new(Column::new("e", 9)), "e"), + ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a"), + ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b"), + ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c"), + ProjectionExpr::new(Arc::new(Column::new("d", 3)), "d"), + ProjectionExpr::new(Arc::new(Column::new("e", 4)), "e"), ], join, )?) as _; @@ -1195,7 +1184,7 @@ fn test_nested_loop_join_after_projection() -> Result<()> { )?) as _; let projection: Arc = Arc::new(ProjectionExec::try_new( - vec![ProjectionExpr::new(col_left_c, "c".to_string())], + vec![ProjectionExpr::new(col_left_c, "c")], Arc::clone(&join), )?) as _; let initial = displayable(projection.as_ref()).indent(true).to_string(); @@ -1282,16 +1271,14 @@ fn test_hash_join_after_projection() -> Result<()> { None, PartitionMode::Auto, NullEquality::NullEqualsNothing, + false, )?); let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ - ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c_from_left".to_string()), - ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b_from_left".to_string()), - ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a_from_left".to_string()), - ProjectionExpr::new( - Arc::new(Column::new("c", 7)), - "c_from_right".to_string(), - ), + ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c_from_left"), + ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b_from_left"), + ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a_from_left"), + ProjectionExpr::new(Arc::new(Column::new("c", 7)), "c_from_right"), ], join.clone(), )?) as _; @@ -1318,8 +1305,8 @@ fn test_hash_join_after_projection() -> Result<()> { assert_snapshot!( actual, @r" - ProjectionExec: expr=[c@2 as c_from_left, b@1 as b_from_left, a@0 as a_from_left, c@3 as c_from_right] - HashJoinExec: mode=Auto, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2, projection=[a@0, b@1, c@2, c@7] + ProjectionExec: expr=[c@0 as c_from_left, b@1 as b_from_left, a@2 as a_from_left, c@3 as c_from_right] + HashJoinExec: mode=Auto, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2, projection=[c@2, b@1, a@0, c@7] DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false " @@ -1327,10 +1314,10 @@ fn test_hash_join_after_projection() -> Result<()> { let projection = Arc::new(ProjectionExec::try_new( vec![ - ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a".to_string()), - ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b".to_string()), - ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c".to_string()), - ProjectionExpr::new(Arc::new(Column::new("c", 7)), "c".to_string()), + ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a"), + ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b"), + ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c"), + ProjectionExpr::new(Arc::new(Column::new("c", 7)), "c"), ], join.clone(), )?); @@ -1371,9 +1358,9 @@ fn test_repartition_after_projection() -> Result<()> { )?); let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ - ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b_new".to_string()), - ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a".to_string()), - ProjectionExpr::new(Arc::new(Column::new("d", 3)), "d_new".to_string()), + ProjectionExpr::new(Arc::new(Column::new("b", 1)), 
"b_new"), + ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a"), + ProjectionExpr::new(Arc::new(Column::new("d", 3)), "d_new"), ], repartition, )?) as _; @@ -1399,14 +1386,12 @@ fn test_repartition_after_projection() -> Result<()> { actual, @r" RepartitionExec: partitioning=Hash([a@1, b_new@0, d_new@2], 6), input_partitions=1 - ProjectionExec: expr=[b@1 as b_new, a@0 as a, d@3 as d_new] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[b@1 as b_new, a, d@3 as d_new], file_type=csv, has_header=false " ); assert_eq!( after_optimize - .as_any() .downcast_ref::() .unwrap() .partitioning() @@ -1441,9 +1426,9 @@ fn test_sort_after_projection() -> Result<()> { ); let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ - ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c".to_string()), - ProjectionExpr::new(Arc::new(Column::new("a", 0)), "new_a".to_string()), - ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b".to_string()), + ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c"), + ProjectionExpr::new(Arc::new(Column::new("a", 0)), "new_a"), + ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b"), ], Arc::new(sort_exec), )?) as _; @@ -1470,8 +1455,7 @@ fn test_sort_after_projection() -> Result<()> { actual, @r" SortExec: expr=[b@2 ASC, c@0 + new_a@1 ASC], preserve_partitioning=[false] - ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[c, a@0 as new_a, b], file_type=csv, has_header=false " ); @@ -1495,9 +1479,9 @@ fn test_sort_preserving_after_projection() -> Result<()> { ); let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ - ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c".to_string()), - ProjectionExpr::new(Arc::new(Column::new("a", 0)), "new_a".to_string()), - ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b".to_string()), + ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c"), + ProjectionExpr::new(Arc::new(Column::new("a", 0)), "new_a"), + ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b"), ], Arc::new(sort_exec), )?) as _; @@ -1524,8 +1508,7 @@ fn test_sort_preserving_after_projection() -> Result<()> { actual, @r" SortPreservingMergeExec: [b@2 ASC, c@0 + new_a@1 ASC] - ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[c, a@0 as new_a, b], file_type=csv, has_header=false " ); @@ -1538,9 +1521,9 @@ fn test_union_after_projection() -> Result<()> { let union = UnionExec::try_new(vec![csv.clone(), csv.clone(), csv])?; let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ - ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c".to_string()), - ProjectionExpr::new(Arc::new(Column::new("a", 0)), "new_a".to_string()), - ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b".to_string()), + ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c"), + ProjectionExpr::new(Arc::new(Column::new("a", 0)), "new_a"), + ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b"), ], union.clone(), )?) 
as _; @@ -1569,12 +1552,9 @@ fn test_union_after_projection() -> Result<()> { actual, @r" UnionExec - ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false - ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false - ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[c, a@0 as new_a, b], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[c, a@0 as new_a, b], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[c, a@0 as new_a, b], file_type=csv, has_header=false " ); @@ -1589,14 +1569,23 @@ fn partitioned_data_source() -> Arc { Field::new("string_col", DataType::Utf8, true), ])); + let options = CsvOptions { + has_header: Some(false), + delimiter: b',', + quote: b'"', + ..Default::default() + }; + let table_schema = TableSchema::new( + Arc::clone(&file_schema), + vec![Arc::new(Field::new("partition_col", DataType::Utf8, true))], + ); let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), - file_schema.clone(), - Arc::new(CsvSource::default()), + Arc::new(CsvSource::new(table_schema).with_csv_options(options)), ) - .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_table_partition_cols(vec![Field::new("partition_col", DataType::Utf8, true)]) + .with_file(PartitionedFile::new("x", 100)) .with_projection_indices(Some(vec![0, 1, 2])) + .unwrap() .build(); DataSourceExec::from_data_source(config) @@ -1611,16 +1600,13 @@ fn test_partition_col_projection_pushdown() -> Result<()> { vec![ ProjectionExpr::new( col("string_col", partitioned_schema.as_ref())?, - "string_col".to_string(), + "string_col", ), ProjectionExpr::new( col("partition_col", partitioned_schema.as_ref())?, - "partition_col".to_string(), - ), - ProjectionExpr::new( - col("int_col", partitioned_schema.as_ref())?, - "int_col".to_string(), + "partition_col", ), + ProjectionExpr::new(col("int_col", partitioned_schema.as_ref())?, "int_col"), ], source, )?); @@ -1634,10 +1620,7 @@ fn test_partition_col_projection_pushdown() -> Result<()> { let actual = after_optimize_string.trim(); assert_snapshot!( actual, - @r" - ProjectionExec: expr=[string_col@1 as string_col, partition_col@2 as partition_col, int_col@0 as int_col] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[int_col, string_col, partition_col], file_type=csv, has_header=false - " + @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[string_col, partition_col, int_col], file_type=csv, has_header=false" ); Ok(()) @@ -1652,7 +1635,7 @@ fn test_partition_col_projection_pushdown_expr() -> Result<()> { vec![ ProjectionExpr::new( col("string_col", partitioned_schema.as_ref())?, - "string_col".to_string(), + "string_col", ), ProjectionExpr::new( // CAST(partition_col, Utf8View) @@ -1661,12 +1644,9 @@ fn test_partition_col_projection_pushdown_expr() -> Result<()> { partitioned_schema.as_ref(), DataType::Utf8View, )?, - "partition_col".to_string(), - ), - ProjectionExpr::new( - col("int_col", partitioned_schema.as_ref())?, - "int_col".to_string(), + "partition_col", ), + ProjectionExpr::new(col("int_col", partitioned_schema.as_ref())?, "int_col"), ], source, )?); @@ 
-1678,11 +1658,214 @@ fn test_partition_col_projection_pushdown_expr() -> Result<()> { .indent(true) .to_string(); let actual = after_optimize_string.trim(); + assert_snapshot!( + actual, + @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[string_col, CAST(partition_col@2 AS Utf8View) as partition_col, int_col], file_type=csv, has_header=false" + ); + + Ok(()) +} + +#[test] +fn test_cooperative_exec_after_projection() -> Result<()> { + let csv = create_simple_csv_exec(); + let cooperative: Arc = Arc::new(CooperativeExec::new(csv)); + let projection: Arc = Arc::new(ProjectionExec::try_new( + vec![ + ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a"), + ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b"), + ], + cooperative, + )?); + + let initial = displayable(projection.as_ref()).indent(true).to_string(); + let actual = initial.trim(); + + assert_snapshot!( + actual, + @r" + ProjectionExec: expr=[a@0 as a, b@1 as b] + CooperativeExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); + + let after_optimize = + ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; + + let after_optimize_string = displayable(after_optimize.as_ref()) + .indent(true) + .to_string(); + let actual = after_optimize_string.trim(); + + // Projection should be pushed down through CooperativeExec + assert_snapshot!( + actual, + @r" + CooperativeExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b], file_type=csv, has_header=false + " + ); + + Ok(()) +} + +#[test] +fn test_hash_join_empty_projection_embeds() -> Result<()> { + let left_csv = create_simple_csv_exec(); + let right_csv = create_simple_csv_exec(); + + let join = Arc::new(HashJoinExec::try_new( + left_csv, + right_csv, + vec![(Arc::new(Column::new("a", 0)), Arc::new(Column::new("a", 0)))], + None, + &JoinType::Right, + None, + PartitionMode::CollectLeft, + NullEquality::NullEqualsNothing, + false, + )?); + + // Empty projection: no columns needed from the join output + let projection: Arc = Arc::new(ProjectionExec::try_new( + vec![] as Vec, + join, + )?); + + let after_optimize = + ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; + let after_optimize_string = displayable(after_optimize.as_ref()) + .indent(true) + .to_string(); + let actual = after_optimize_string.trim(); + + // The empty projection should be embedded into the HashJoinExec, + // resulting in projection=[] on the join and no ProjectionExec wrapper. + assert_snapshot!( + actual, + @r" + HashJoinExec: mode=CollectLeft, join_type=Right, on=[(a@0, a@0)], projection=[] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); + + Ok(()) +} + +/// Regression test for +/// +/// When a `ProjectionExec` sits on top of a `FilterExec` that already carries +/// an embedded projection, the `ProjectionPushdown` optimizer must not panic. +/// +/// Before the fix, `FilterExecBuilder::from(self)` copied stale projection +/// indices (e.g. `[0, 1, 2]`). After swapping, the new input was narrower +/// (2 columns), so `.build()` panicked with "project index out of bounds". 
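The failure mode described above reduces to an index-validation invariant: an embedded projection stores column indices into its input, and those indices go stale the moment the input is swapped for a narrower plan. A minimal sketch of that invariant, independent of DataFusion's actual builder code (validate_projection is illustrative only):

    /// Illustrative check only: embedded projection indices must be
    /// re-derived (or at least re-validated) against the *new* input arity
    /// after a swap, instead of being copied verbatim from the old plan.
    fn validate_projection(indices: &[usize], input_columns: usize) -> Result<(), String> {
        match indices.iter().find(|&&i| i >= input_columns) {
            // e.g. stale indices [0, 1, 2] against a 2-column input
            // reproduce the "project index out of bounds" panic above
            Some(&i) => Err(format!(
                "project index {i} out of bounds, input has {input_columns} columns"
            )),
            None => Ok(()),
        }
    }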
+#[test] +fn test_filter_with_embedded_projection_after_projection() -> Result<()> { + // DataSourceExec: [a, b, c, d, e] + let csv = create_simple_csv_exec(); + + // FilterExec: a > 0, projection=[0, 1, 2] → output: [a, b, c] + let predicate = Arc::new(BinaryExpr::new( + Arc::new(Column::new("a", 0)), + Operator::Gt, + Arc::new(Literal::new(ScalarValue::Int32(Some(0)))), + )); + let filter: Arc = Arc::new( + FilterExecBuilder::new(predicate, csv) + .apply_projection(Some(vec![0, 1, 2]))? + .build()?, + ); + + // ProjectionExec: narrows [a, b, c] → [a, b] + let projection: Arc = Arc::new(ProjectionExec::try_new( + vec![ + ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a"), + ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b"), + ], + filter, + )?); + + let initial = displayable(projection.as_ref()).indent(true).to_string(); + let actual = initial.trim(); + assert_snapshot!( + actual, + @r" + ProjectionExec: expr=[a@0 as a, b@1 as b] + FilterExec: a@0 > 0, projection=[a@0, b@1, c@2] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); + + // This must not panic + let after_optimize = + ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; + let after_optimize_string = displayable(after_optimize.as_ref()) + .indent(true) + .to_string(); + let actual = after_optimize_string.trim(); + assert_snapshot!( + actual, + @r" + FilterExec: a@0 > 0 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b], file_type=csv, has_header=false + " + ); + + Ok(()) +} + +/// Same as above, but the outer ProjectionExec also renames columns. +/// Ensures the rename is preserved after the projection pushdown swap. +#[test] +fn test_filter_with_embedded_projection_after_renaming_projection() -> Result<()> { + let csv = create_simple_csv_exec(); + + // FilterExec: b > 10, projection=[0, 1, 2, 3] → output: [a, b, c, d] + let predicate = Arc::new(BinaryExpr::new( + Arc::new(Column::new("b", 1)), + Operator::Gt, + Arc::new(Literal::new(ScalarValue::Int32(Some(10)))), + )); + let filter: Arc = Arc::new( + FilterExecBuilder::new(predicate, csv) + .apply_projection(Some(vec![0, 1, 2, 3]))? 
+ .build()?, + ); + + // ProjectionExec: [a as x, b as y] — narrows and renames + let projection: Arc = Arc::new(ProjectionExec::try_new( + vec![ + ProjectionExpr::new(Arc::new(Column::new("a", 0)), "x"), + ProjectionExpr::new(Arc::new(Column::new("b", 1)), "y"), + ], + filter, + )?); + + let initial = displayable(projection.as_ref()).indent(true).to_string(); + let actual = initial.trim(); + assert_snapshot!( + actual, + @r" + ProjectionExec: expr=[a@0 as x, b@1 as y] + FilterExec: b@1 > 10, projection=[a@0, b@1, c@2, d@3] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); + + let after_optimize = + ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; + let after_optimize_string = displayable(after_optimize.as_ref()) + .indent(true) + .to_string(); + let actual = after_optimize_string.trim(); assert_snapshot!( actual, @r" - ProjectionExec: expr=[string_col@1 as string_col, CAST(partition_col@2 AS Utf8View) as partition_col, int_col@0 as int_col] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[int_col, string_col, partition_col], file_type=csv, has_header=false + FilterExec: y@1 > 10 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a@0 as x, b@1 as y], file_type=csv, has_header=false " ); diff --git a/datafusion/core/tests/physical_optimizer/pushdown_sort.rs b/datafusion/core/tests/physical_optimizer/pushdown_sort.rs new file mode 100644 index 0000000000000..e2700c3174a16 --- /dev/null +++ b/datafusion/core/tests/physical_optimizer/pushdown_sort.rs @@ -0,0 +1,1086 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tests for sort pushdown optimizer rule (Phase 1) +//! +//! Phase 1 tests verify that: +//! 1. Reverse scan is enabled (reverse_row_groups=true) +//! 2. SortExec is kept (because ordering is inexact) +//! 3. output_ordering remains unchanged +//! 4. Early termination is enabled for TopK queries +//! 5. 
Prefix matching works correctly + +use datafusion_physical_expr::expressions; +use datafusion_physical_expr_common::physical_expr::PhysicalExpr; +use datafusion_physical_expr_common::sort_expr::LexOrdering; +use datafusion_physical_optimizer::PhysicalOptimizerRule; +use datafusion_physical_optimizer::pushdown_sort::PushdownSort; +use std::sync::Arc; + +use crate::physical_optimizer::test_utils::{ + OptimizationTest, TestScan, coalesce_partitions_exec, parquet_exec, + parquet_exec_with_sort, projection_exec, projection_exec_with_alias, + repartition_exec, schema, simple_projection_exec, sort_exec, sort_exec_with_fetch, + sort_expr, sort_expr_named, test_scan_with_ordering, +}; + +#[test] +fn test_sort_pushdown_disabled() { + // When pushdown is disabled, plan should remain unchanged + let schema = schema(); + let source = parquet_exec(schema.clone()); + let sort_exprs = LexOrdering::new(vec![sort_expr("a", &schema)]).unwrap(); + let plan = sort_exec(sort_exprs, source); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), false), + @r" + OptimizationTest: + input: + - SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + output: + Ok: + - SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + " + ); +} + +#[test] +fn test_sort_pushdown_basic_phase1() { + // Phase 1: Reverse scan enabled, Sort kept, output_ordering unchanged + let schema = schema(); + + // Source has ASC NULLS LAST ordering (default) + let a = sort_expr("a", &schema); + let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap(); + let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]); + + // Request DESC NULLS LAST ordering (exact reverse) + let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap(); + let plan = sort_exec(desc_ordering, source); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + output: + Ok: + - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true + " + ); +} + +#[test] +fn test_sort_with_limit_phase1() { + // Phase 1: Sort with fetch enables early termination but keeps Sort + let schema = schema(); + + // Source has ASC ordering + let a = sort_expr("a", &schema); + let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap(); + let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]); + + // Request DESC ordering with limit + let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap(); + let plan = sort_exec_with_fetch(desc_ordering, Some(10), source); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: TopK(fetch=10), expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + output: + Ok: + - SortExec: TopK(fetch=10), expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: 
[[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true + " + ); +} + +#[test] +fn test_sort_multiple_columns_phase1() { + // Phase 1: Sort on multiple columns - reverse multi-column ordering + let schema = schema(); + + // Source has [a DESC NULLS LAST, b ASC] ordering + let a = sort_expr("a", &schema); + let b = sort_expr("b", &schema); + let source_ordering = LexOrdering::new(vec![a.clone().reverse(), b.clone()]).unwrap(); + let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]); + + // Request [a ASC NULLS FIRST, b DESC] ordering (exact reverse) + let reverse_ordering = + LexOrdering::new(vec![a.clone().asc().nulls_first(), b.reverse()]).unwrap(); + let plan = sort_exec(reverse_ordering, source); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: expr=[a@0 ASC, b@1 DESC NULLS LAST], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 DESC NULLS LAST, b@1 ASC], file_type=parquet + output: + Ok: + - SortExec: expr=[a@0 ASC, b@1 DESC NULLS LAST], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true + " + ); +} + +// ============================================================================ +// PREFIX MATCHING TESTS +// ============================================================================ + +#[test] +fn test_prefix_match_single_column() { + // Test prefix matching: source has [a DESC, b ASC], query needs [a ASC] + // After reverse: [a ASC, b DESC] which satisfies [a ASC] prefix + let schema = schema(); + + // Source has [a DESC NULLS LAST, b ASC NULLS LAST] ordering + let a = sort_expr("a", &schema); + let b = sort_expr("b", &schema); + let source_ordering = LexOrdering::new(vec![a.clone().reverse(), b]).unwrap(); + let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]); + + // Request only [a ASC NULLS FIRST] - a prefix of the reversed ordering + let prefix_ordering = LexOrdering::new(vec![a.clone().asc().nulls_first()]).unwrap(); + let plan = sort_exec(prefix_ordering, source); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 DESC NULLS LAST, b@1 ASC], file_type=parquet + output: + Ok: + - SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true + " + ); +} + +#[test] +fn test_prefix_match_with_limit() { + // Test prefix matching with LIMIT - important for TopK optimization + let schema = schema(); + + // Source has [a ASC, b DESC, c ASC] ordering + let a = sort_expr("a", &schema); + let b = sort_expr("b", &schema); + let c = sort_expr("c", &schema); + let source_ordering = + LexOrdering::new(vec![a.clone(), b.clone().reverse(), c]).unwrap(); + let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]); + + // Request [a DESC NULLS LAST, b ASC NULLS FIRST] with LIMIT 100 + // This is a prefix (2 columns) of the reversed 3-column ordering + let prefix_ordering = + LexOrdering::new(vec![a.reverse(), b.clone().asc().nulls_first()]).unwrap(); + let plan = sort_exec_with_fetch(prefix_ordering, Some(100), 
source); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: TopK(fetch=100), expr=[a@0 DESC NULLS LAST, b@1 ASC], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC, b@1 DESC NULLS LAST, c@2 ASC], file_type=parquet + output: + Ok: + - SortExec: TopK(fetch=100), expr=[a@0 DESC NULLS LAST, b@1 ASC], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true + " + ); +} + +#[test] +fn test_prefix_match_through_transparent_nodes() { + // Test prefix matching works through transparent nodes + let schema = schema(); + + // Source has [a DESC NULLS LAST, b ASC, c DESC] ordering + let a = sort_expr("a", &schema); + let b = sort_expr("b", &schema); + let c = sort_expr("c", &schema); + let source_ordering = + LexOrdering::new(vec![a.clone().reverse(), b, c.reverse()]).unwrap(); + let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]); + let repartition = repartition_exec(source); + + // Request only [a ASC NULLS FIRST] - prefix of reversed ordering + let prefix_ordering = LexOrdering::new(vec![a.clone().asc().nulls_first()]).unwrap(); + let plan = sort_exec(prefix_ordering, repartition); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 DESC NULLS LAST, b@1 ASC, c@2 DESC NULLS LAST], file_type=parquet + output: + Ok: + - SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true + " + ); +} + +#[test] +fn test_exact_prefix_match_same_direction() { + // Test that when the requested sort [a DESC] matches a prefix of the source's + // natural ordering [a DESC, b ASC], the Sort is eliminated (Exact pushdown). 
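// A simplified, self-contained model of the prefix rule the surrounding
// tests pin down: a reverse scan satisfies the request iff the requested
// keys are a prefix of the element-wise reversal of the source ordering
// (so the request must not be longer than the source; see the
// longer-than-source case below). The real rule operates on
// LexOrdering/PhysicalSortExpr; `SortKey` here is illustrative only.
#[derive(Clone, Copy, PartialEq)]
struct SortKey {
    column: usize,
    descending: bool,
    nulls_first: bool,
}

// Reversing a key flips both the direction and the null placement.
fn reversed(k: SortKey) -> SortKey {
    SortKey {
        column: k.column,
        descending: !k.descending,
        nulls_first: !k.nulls_first,
    }
}

fn satisfied_by_reverse_scan(requested: &[SortKey], source: &[SortKey]) -> bool {
    requested.len() <= source.len()
        && requested.iter().zip(source).all(|(r, s)| *r == reversed(*s))
}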
+ let schema = schema(); + + // Source has [a DESC, b ASC] ordering + let a = sort_expr("a", &schema); + let b = sort_expr("b", &schema); + let source_ordering = LexOrdering::new(vec![a.clone().reverse(), b]).unwrap(); + let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]); + + // Request [a DESC] - same direction as source prefix, Sort should be eliminated + let same_direction = LexOrdering::new(vec![a.clone().reverse()]).unwrap(); + let plan = sort_exec(same_direction, source); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 DESC NULLS LAST, b@1 ASC], file_type=parquet + output: + Ok: + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 DESC NULLS LAST, b@1 ASC], file_type=parquet + " + ); +} + +#[test] +fn test_no_prefix_match_longer_than_source() { + // Test that prefix matching does NOT work if requested is longer than source + let schema = schema(); + + // Source has [a DESC] ordering (single column) + let a = sort_expr("a", &schema); + let b = sort_expr("b", &schema); + let source_ordering = LexOrdering::new(vec![a.clone().reverse()]).unwrap(); + let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]); + + // Request [a ASC, b DESC] - longer than source, can't be a prefix + let longer_ordering = + LexOrdering::new(vec![a.clone().asc().nulls_first(), b.reverse()]).unwrap(); + let plan = sort_exec(longer_ordering, source); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: expr=[a@0 ASC, b@1 DESC NULLS LAST], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 DESC NULLS LAST], file_type=parquet + output: + Ok: + - SortExec: expr=[a@0 ASC, b@1 DESC NULLS LAST], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 DESC NULLS LAST], file_type=parquet + " + ); +} + +// ============================================================================ +// ORIGINAL TESTS +// ============================================================================ + +#[test] +fn test_sort_through_repartition() { + // Sort should push through RepartitionExec + let schema = schema(); + let a = sort_expr("a", &schema); + let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap(); + let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]); + let repartition = repartition_exec(source); + + let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap(); + let plan = sort_exec(desc_ordering, repartition); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + output: + Ok: + - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + - DataSourceExec: file_groups={1 group: [[x]]}, 
projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true + " + ); +} + +#[test] +fn test_nested_sorts() { + // Nested sort operations - only innermost can be optimized + let schema = schema(); + let a = sort_expr("a", &schema); + let b = sort_expr("b", &schema); + let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap(); + let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]); + + let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap(); + let inner_sort = sort_exec(desc_ordering, source); + + let sort_exprs2 = LexOrdering::new(vec![b]).unwrap(); + let plan = sort_exec(sort_exprs2, inner_sort); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: expr=[b@1 ASC], preserve_partitioning=[false] + - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + output: + Ok: + - SortExec: expr=[b@1 ASC], preserve_partitioning=[false] + - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true + " + ); +} + +#[test] +fn test_non_sort_plans_unchanged() { + // Plans without SortExec should pass through unchanged + let schema = schema(); + let plan = parquet_exec(schema.clone()); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + output: + Ok: + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + " + ); +} + +#[test] +fn test_optimizer_properties() { + // Test optimizer metadata + let optimizer = PushdownSort::new(); + + assert_eq!(optimizer.name(), "PushdownSort"); + assert!(optimizer.schema_check()); +} + +#[test] +fn test_sort_through_coalesce_partitions() { + // Sort should push through CoalescePartitionsExec + let schema = schema(); + let a = sort_expr("a", &schema); + let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap(); + let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]); + let repartition = repartition_exec(source); + let coalesce_parts = coalesce_partitions_exec(repartition); + + let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap(); + let plan = sort_exec(desc_ordering, coalesce_parts); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - CoalescePartitionsExec + - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + output: + Ok: + - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - CoalescePartitionsExec + - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true + " + ); +} + +#[test] +fn test_complex_plan_with_multiple_operators() { + // Test a complex plan with multiple operators between sort and source + let schema = schema(); + let a = sort_expr("a", &schema); + let 
source_ordering = LexOrdering::new(vec![a.clone()]).unwrap(); + let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]); + let repartition = repartition_exec(source); + let coalesce_parts = coalesce_partitions_exec(repartition); + + let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap(); + let plan = sort_exec(desc_ordering, coalesce_parts); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - CoalescePartitionsExec + - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + output: + Ok: + - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - CoalescePartitionsExec + - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true + " + ); +} + +#[test] +fn test_multiple_sorts_different_columns() { + // Test nested sorts on different columns - only innermost can optimize + let schema = schema(); + let a = sort_expr("a", &schema); + let c = sort_expr("c", &schema); + let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap(); + let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]); + + // First sort by column 'a' DESC (reverse of source) + let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap(); + let sort1 = sort_exec(desc_ordering, source); + + // Then sort by column 'c' (different column, can't optimize) + let sort_exprs2 = LexOrdering::new(vec![c]).unwrap(); + let plan = sort_exec(sort_exprs2, sort1); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + output: + Ok: + - SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true + " + ); +} + +#[test] +fn test_no_pushdown_for_unordered_source() { + // Verify pushdown does NOT happen for sources without ordering + let schema = schema(); + let source = parquet_exec(schema.clone()); // No output_ordering + let sort_exprs = LexOrdering::new(vec![sort_expr("a", &schema)]).unwrap(); + let plan = sort_exec(sort_exprs, source); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + output: + Ok: + - SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + " + ); +} + +#[test] +fn test_no_pushdown_for_non_reverse_sort() { + // Verify pushdown does NOT happen when sort doesn't reverse source ordering + let schema = schema(); + + // Source sorted by 'a' ASC + let a = sort_expr("a", &schema); + let b = 
sort_expr("b", &schema); + let source_ordering = LexOrdering::new(vec![a]).unwrap(); + let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]); + + // Request sort by 'b' (different column) + let sort_exprs = LexOrdering::new(vec![b]).unwrap(); + let plan = sort_exec(sort_exprs, source); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: expr=[b@1 ASC], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + output: + Ok: + - SortExec: expr=[b@1 ASC], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + " + ); +} + +#[test] +fn test_pushdown_through_blocking_node() { + // Test that pushdown works for inner sort even when outer sort is blocked + // Structure: Sort -> Aggregate (blocks pushdown) -> Sort -> Scan + // The outer sort can't push through aggregate, but the inner sort should still optimize + use datafusion_functions_aggregate::count::count_udaf; + use datafusion_physical_expr::aggregate::AggregateExprBuilder; + use datafusion_physical_plan::aggregates::{ + AggregateExec, AggregateMode, PhysicalGroupBy, + }; + use std::sync::Arc; + + let schema = schema(); + + // Bottom: DataSource with [a ASC NULLS LAST] ordering + let a = sort_expr("a", &schema); + let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap(); + let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]); + + // Inner Sort: [a DESC NULLS FIRST] - exact reverse, CAN push down to source + let inner_sort_ordering = LexOrdering::new(vec![a.clone().reverse()]).unwrap(); + let inner_sort = sort_exec(inner_sort_ordering, source); + + // Middle: Aggregate (blocks pushdown from outer sort) + // GROUP BY a, COUNT(b) + let group_by = PhysicalGroupBy::new_single(vec![( + Arc::new(expressions::Column::new("a", 0)) as _, + "a".to_string(), + )]); + + let count_expr = Arc::new( + AggregateExprBuilder::new( + count_udaf(), + vec![Arc::new(expressions::Column::new("b", 1)) as _], + ) + .schema(Arc::clone(&schema)) + .alias("COUNT(b)") + .build() + .unwrap(), + ); + + let aggregate = Arc::new( + AggregateExec::try_new( + AggregateMode::Final, + group_by, + vec![count_expr], + vec![None], + inner_sort, + Arc::clone(&schema), + ) + .unwrap(), + ); + + // Outer Sort: [a ASC] - this CANNOT push down through aggregate + let outer_sort_ordering = LexOrdering::new(vec![a.clone()]).unwrap(); + let plan = sort_exec(outer_sort_ordering, aggregate); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + - AggregateExec: mode=Final, gby=[a@0 as a], aggr=[COUNT(b)], ordering_mode=Sorted + - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + output: + Ok: + - SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + - AggregateExec: mode=Final, gby=[a@0 as a], aggr=[COUNT(b)], ordering_mode=Sorted + - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true + " + ); +} + +// 
============================================================================ +// PROJECTION TESTS +// ============================================================================ + +#[test] +fn test_sort_pushdown_through_simple_projection() { + // Sort pushes through projection with simple column references + let schema = schema(); + + // Source has [a ASC] ordering + let a = sort_expr("a", &schema); + let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap(); + let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]); + + // Projection: SELECT a, b (simple column references) + let projection = simple_projection_exec(source, vec![0, 1]); // columns a, b + + // Request [a DESC] - should push through projection to source + let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap(); + let plan = sort_exec(desc_ordering, projection); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - ProjectionExec: expr=[a@0 as a, b@1 as b] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + output: + Ok: + - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - ProjectionExec: expr=[a@0 as a, b@1 as b] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true + " + ); +} + +#[test] +fn test_sort_pushdown_through_projection_with_alias() { + // Sort pushes through projection with column aliases + let schema = schema(); + + // Source has [a ASC] ordering + let a = sort_expr("a", &schema); + let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap(); + let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]); + + // Projection: SELECT a AS id, b AS value + let projection = projection_exec_with_alias(source, vec![(0, "id"), (1, "value")]); + + // Request [id DESC] - should map to [a DESC] and push down + let id_expr = sort_expr_named("id", 0); + let desc_ordering = LexOrdering::new(vec![id_expr.reverse()]).unwrap(); + let plan = sort_exec(desc_ordering, projection); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: expr=[id@0 DESC NULLS LAST], preserve_partitioning=[false] + - ProjectionExec: expr=[a@0 as id, b@1 as value] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + output: + Ok: + - SortExec: expr=[id@0 DESC NULLS LAST], preserve_partitioning=[false] + - ProjectionExec: expr=[a@0 as id, b@1 as value] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true + " + ); +} + +#[test] +fn test_no_sort_pushdown_through_computed_projection() { + use datafusion_expr::Operator; + + // Sort should NOT push through projection with computed columns + let schema = schema(); + + // Source has [a ASC] ordering + let a = sort_expr("a", &schema); + let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap(); + let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]); + + // Projection: SELECT a+b as sum, c + let projection = projection_exec( + vec![ + ( + Arc::new(expressions::BinaryExpr::new( + Arc::new(expressions::Column::new("a", 0)), + Operator::Plus, + Arc::new(expressions::Column::new("b", 1)), + )) as Arc, + 
"sum".to_string(), + ), + ( + Arc::new(expressions::Column::new("c", 2)) as Arc, + "c".to_string(), + ), + ], + source, + ) + .unwrap(); + + // Request [sum DESC] - should NOT push down (sum is computed) + let sum_expr = sort_expr_named("sum", 0); + let desc_ordering = LexOrdering::new(vec![sum_expr.reverse()]).unwrap(); + let plan = sort_exec(desc_ordering, projection); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: expr=[sum@0 DESC NULLS LAST], preserve_partitioning=[false] + - ProjectionExec: expr=[a@0 + b@1 as sum, c@2 as c] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + output: + Ok: + - SortExec: expr=[sum@0 DESC NULLS LAST], preserve_partitioning=[false] + - ProjectionExec: expr=[a@0 + b@1 as sum, c@2 as c] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + " + ); +} + +#[test] +fn test_sort_pushdown_projection_reordered_columns() { + // Sort pushes through projection that reorders columns + let schema = schema(); + + // Source has [a ASC] ordering + let a = sort_expr("a", &schema); + let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap(); + let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]); + + // Projection: SELECT c, b, a (columns reordered) + let projection = simple_projection_exec(source, vec![2, 1, 0]); // c, b, a + + // Request [a DESC] where a is now at index 2 in projection output + let a_expr_at_2 = sort_expr_named("a", 2); + let desc_ordering = LexOrdering::new(vec![a_expr_at_2.reverse()]).unwrap(); + let plan = sort_exec(desc_ordering, projection); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: expr=[a@2 DESC NULLS LAST], preserve_partitioning=[false] + - ProjectionExec: expr=[c@2 as c, b@1 as b, a@0 as a] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + output: + Ok: + - SortExec: expr=[a@2 DESC NULLS LAST], preserve_partitioning=[false] + - ProjectionExec: expr=[c@2 as c, b@1 as b, a@0 as a] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true + " + ); +} + +#[test] +fn test_sort_pushdown_projection_with_limit() { + // Sort with LIMIT pushes through simple projection + let schema = schema(); + + // Source has [a ASC] ordering + let a = sort_expr("a", &schema); + let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap(); + let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]); + + // Projection: SELECT a, b + let projection = simple_projection_exec(source, vec![0, 1]); + + // Request [a DESC] with LIMIT 10 + let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap(); + let plan = sort_exec_with_fetch(desc_ordering, Some(10), projection); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: TopK(fetch=10), expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - ProjectionExec: expr=[a@0 as a, b@1 as b] + - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + output: + Ok: + - SortExec: TopK(fetch=10), expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - ProjectionExec: 
expr=[a@0 as a, b@1 as b]
+            - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true
+        "
+    );
+}
+
+#[test]
+fn test_sort_pushdown_through_projection() {
+    // Sort pushes through a simple projection
+    let schema = schema();
+
+    // Source has [a ASC] ordering
+    let a = sort_expr("a", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+
+    // Projection: SELECT a, b
+    let projection = simple_projection_exec(source, vec![0, 1]);
+
+    // Request [a DESC]
+    let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap();
+    let plan = sort_exec(desc_ordering, projection);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+        OptimizationTest:
+          input:
+            - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+            - ProjectionExec: expr=[a@0 as a, b@1 as b]
+            - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+          output:
+            Ok:
+              - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+              - ProjectionExec: expr=[a@0 as a, b@1 as b]
+              - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true
+        "
+    );
+}
+
+#[test]
+fn test_sort_pushdown_projection_subset_of_columns() {
+    // Sort pushes through a projection that selects a subset of columns
+    let schema = schema();
+
+    // Source has [a ASC, b ASC] ordering
+    let a = sort_expr("a", &schema);
+    let b = sort_expr("b", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone(), b.clone()]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+
+    // Projection: SELECT a (subset of columns)
+    let projection = simple_projection_exec(source, vec![0]);
+
+    // Request [a DESC]
+    let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap();
+    let plan = sort_exec(desc_ordering, projection);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+        OptimizationTest:
+          input:
+            - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+            - ProjectionExec: expr=[a@0 as a]
+            - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC, b@1 ASC], file_type=parquet
+          output:
+            Ok:
+              - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+              - ProjectionExec: expr=[a@0 as a]
+              - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true
+        "
+    );
+}
+
+// ============================================================================
+// TESTSCAN DEMONSTRATION TESTS
+// ============================================================================
+// These tests use TestScan to demonstrate sort pushdown more clearly than the
+// Parquet-based scans: TestScan can accept ANY ordering (not just the
+// reverse) and displays the requested ordering explicitly in its output.
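As a rough mental model of why TestScan keeps these snapshots readable, assume it simply records whatever ordering the rule pushes into it and echoes that from its display string. The real TestScan lives in test_utils and implements ExecutionPlan; OrderingRecorder below is a hypothetical reduction to just that bookkeeping, with orderings modeled as plain strings:

    // Hypothetical reduction of the TestScan idea: remember the pushed-down
    // request and surface it in the display output so snapshots can assert
    // on the request itself, independent of whether it could be honored.
    #[derive(Default)]
    struct OrderingRecorder {
        output_ordering: Option<String>,
        requested_ordering: Option<String>,
    }

    impl OrderingRecorder {
        fn with_requested(mut self, ordering: &str) -> Self {
            self.requested_ordering = Some(ordering.to_string());
            self
        }

        fn display(&self) -> String {
            let mut parts = Vec::new();
            if let Some(o) = &self.output_ordering {
                parts.push(format!("output_ordering=[{o}]"));
            }
            if let Some(r) = &self.requested_ordering {
                parts.push(format!("requested_ordering=[{r}]"));
            }
            if parts.is_empty() {
                "TestScan".to_string()
            } else {
                format!("TestScan: {}", parts.join(", "))
            }
        }
    }

Under that model, `requested_ordering=[a@0 DESC NULLS LAST]` in the snapshots that follow is just the recorded request, which is what lets TestScan accept orderings a Parquet reverse scan could not.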
+ +#[test] +fn test_sort_pushdown_with_test_scan_basic() { + // Demonstrates TestScan showing requested ordering clearly + let schema = schema(); + + // Source has [a ASC] ordering + let a = sort_expr("a", &schema); + let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap(); + let source = test_scan_with_ordering(schema.clone(), source_ordering); + + // Request [a DESC] ordering + let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap(); + let plan = sort_exec(desc_ordering, source); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - TestScan: output_ordering=[a@0 ASC] + output: + Ok: + - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] + - TestScan: output_ordering=[a@0 ASC], requested_ordering=[a@0 DESC NULLS LAST] + " + ); +} + +#[test] +fn test_sort_pushdown_with_test_scan_multi_column() { + // Demonstrates TestScan with multi-column ordering + let schema = schema(); + + // Source has [a ASC, b DESC] ordering + let a = sort_expr("a", &schema); + let b = sort_expr("b", &schema); + let source_ordering = LexOrdering::new(vec![a.clone(), b.clone().reverse()]).unwrap(); + let source = test_scan_with_ordering(schema.clone(), source_ordering); + + // Request [a DESC, b ASC] ordering (reverse of source) + let reverse_ordering = LexOrdering::new(vec![a.reverse(), b]).unwrap(); + let plan = sort_exec(reverse_ordering, source); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: expr=[a@0 DESC NULLS LAST, b@1 ASC], preserve_partitioning=[false] + - TestScan: output_ordering=[a@0 ASC, b@1 DESC NULLS LAST] + output: + Ok: + - SortExec: expr=[a@0 DESC NULLS LAST, b@1 ASC], preserve_partitioning=[false] + - TestScan: output_ordering=[a@0 ASC, b@1 DESC NULLS LAST], requested_ordering=[a@0 DESC NULLS LAST, b@1 ASC] + " + ); +} + +#[test] +fn test_sort_pushdown_with_test_scan_arbitrary_ordering() { + // Demonstrates that TestScan can accept ANY ordering (not just reverse) + // This is different from ParquetExec which only supports reverse scans + let schema = schema(); + + // Source has [a ASC, b ASC] ordering + let a = sort_expr("a", &schema); + let b = sort_expr("b", &schema); + let source_ordering = LexOrdering::new(vec![a.clone(), b.clone()]).unwrap(); + let source = test_scan_with_ordering(schema.clone(), source_ordering); + + // Request [a ASC, b DESC] - NOT a simple reverse, but TestScan accepts it + let mixed_ordering = LexOrdering::new(vec![a, b.reverse()]).unwrap(); + let plan = sort_exec(mixed_ordering, source); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: expr=[a@0 ASC, b@1 DESC NULLS LAST], preserve_partitioning=[false] + - TestScan: output_ordering=[a@0 ASC, b@1 ASC] + output: + Ok: + - SortExec: expr=[a@0 ASC, b@1 DESC NULLS LAST], preserve_partitioning=[false] + - TestScan: output_ordering=[a@0 ASC, b@1 ASC], requested_ordering=[a@0 ASC, b@1 DESC NULLS LAST] + " + ); +} + +// ============================================================================ +// EXACT PUSHDOWN TESTS (source guarantees ordering, SortExec removed) +// ============================================================================ + +#[test] +fn test_sort_pushdown_exact_no_fetch_no_limit() { + // When a source returns Exact (without fetch), the SortExec should be + 
// removed entirely with no GlobalLimitExec wrapper. + let schema = schema(); + let a = sort_expr("a", &schema); + let b = sort_expr("b", &schema); + let source = + Arc::new(TestScan::new(schema.clone(), vec![]).with_exact_pushdown(true)); + + let ordering = LexOrdering::new(vec![a, b.reverse()]).unwrap(); + let plan = sort_exec(ordering, source); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: expr=[a@0 ASC, b@1 DESC NULLS LAST], preserve_partitioning=[false] + - TestScan + output: + Ok: + - TestScan: requested_ordering=[a@0 ASC, b@1 DESC NULLS LAST] + " + ); +} + +#[test] +fn test_sort_pushdown_exact_preserves_fetch_with_global_limit() { + // When a source returns Exact but does NOT support with_fetch(), + // the optimizer must wrap the result with GlobalLimitExec to preserve + // the LIMIT from the eliminated SortExec. + let schema = schema(); + let a = sort_expr("a", &schema); + let source = + Arc::new(TestScan::new(schema.clone(), vec![]).with_exact_pushdown(true)); + + let ordering = LexOrdering::new(vec![a]).unwrap(); + let plan = sort_exec_with_fetch(ordering, Some(10), source); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: TopK(fetch=10), expr=[a@0 ASC], preserve_partitioning=[false] + - TestScan + output: + Ok: + - GlobalLimitExec: skip=0, fetch=10 + - TestScan: requested_ordering=[a@0 ASC] + " + ); +} + +#[test] +fn test_sort_pushdown_exact_preserves_fetch_with_source_support() { + // When a source returns Exact AND supports with_fetch(), + // the limit should be pushed into the source directly (no GlobalLimitExec). + let schema = schema(); + let a = sort_expr("a", &schema); + let source = Arc::new( + TestScan::new(schema.clone(), vec![]) + .with_exact_pushdown(true) + .with_supports_fetch(true), + ); + + let ordering = LexOrdering::new(vec![a]).unwrap(); + let plan = sort_exec_with_fetch(ordering, Some(10), source); + + insta::assert_snapshot!( + OptimizationTest::new(plan, PushdownSort::new(), true), + @r" + OptimizationTest: + input: + - SortExec: TopK(fetch=10), expr=[a@0 ASC], preserve_partitioning=[false] + - TestScan + output: + Ok: + - TestScan: requested_ordering=[a@0 ASC], fetch=10 + " + ); +} diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs b/datafusion/core/tests/physical_optimizer/pushdown_utils.rs similarity index 79% rename from datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs rename to datafusion/core/tests/physical_optimizer/pushdown_utils.rs index 7d8a9c7c2125c..8b659e757aa2a 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs +++ b/datafusion/core/tests/physical_optimizer/pushdown_utils.rs @@ -18,33 +18,31 @@ use arrow::datatypes::SchemaRef; use arrow::{array::RecordBatch, compute::concat_batches}; use datafusion::{datasource::object_store::ObjectStoreUrl, physical_plan::PhysicalExpr}; -use datafusion_common::{config::ConfigOptions, internal_err, Result, Statistics}; +use datafusion_common::tree_node::TreeNodeRecursion; +use datafusion_common::{Result, config::ConfigOptions, internal_err}; use datafusion_datasource::{ - file::FileSource, file_scan_config::FileScanConfig, + PartitionedFile, file::FileSource, file_scan_config::FileScanConfig, file_scan_config::FileScanConfigBuilder, file_stream::FileOpenFuture, - file_stream::FileOpener, schema_adapter::DefaultSchemaAdapterFactory, - 
schema_adapter::SchemaAdapterFactory, source::DataSourceExec, PartitionedFile,
-    TableSchema,
+    file_stream::FileOpener, source::DataSourceExec,
 };
+use datafusion_physical_expr::projection::ProjectionExprs;
 use datafusion_physical_expr_common::physical_expr::fmt_sql;
 use datafusion_physical_optimizer::PhysicalOptimizerRule;
 use datafusion_physical_plan::filter::batch_filter;
 use datafusion_physical_plan::filter_pushdown::{FilterPushdownPhase, PushedDown};
 use datafusion_physical_plan::{
-    displayable,
+    DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, displayable,
     filter::FilterExec,
     filter_pushdown::{
         ChildFilterDescription, ChildPushdownResult, FilterDescription,
         FilterPushdownPropagation,
     },
     metrics::ExecutionPlanMetricsSet,
-    DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties,
 };
 use futures::StreamExt;
 use futures::{FutureExt, Stream};
 use object_store::ObjectStore;
 use std::{
-    any::Any,
     fmt::{Display, Formatter},
     pin::Pin,
     sync::Arc,
@@ -53,14 +51,17 @@ use std::{

 pub struct TestOpener {
     batches: Vec<RecordBatch>,
     batch_size: Option<usize>,
-    schema: Option<SchemaRef>,
-    projection: Option<Vec<usize>>,
+    projection: Option<ProjectionExprs>,
     predicate: Option<Arc<dyn PhysicalExpr>>,
 }

 impl FileOpener for TestOpener {
     fn open(&self, _partitioned_file: PartitionedFile) -> Result<FileOpenFuture> {
         let mut batches = self.batches.clone();
+        if self.batches.is_empty() {
+            return Ok((async { Ok(TestStream::new(vec![]).boxed()) }).boxed());
+        }
+        let schema = self.batches[0].schema();
         if let Some(batch_size) = self.batch_size {
             let batch = concat_batches(&batches[0].schema(), &batches)?;
             let mut new_batches = Vec::new();
@@ -71,27 +72,23 @@ impl FileOpener for TestOpener {
             }
             batches = new_batches.into_iter().collect();
         }
-        if let Some(schema) = &self.schema {
-            let factory = DefaultSchemaAdapterFactory::from_schema(Arc::clone(schema));
-            let (mapper, projection) = factory.map_schema(&batches[0].schema()).unwrap();
-            let mut new_batches = Vec::new();
-            for batch in batches {
-                let batch = if let Some(predicate) = &self.predicate {
-                    batch_filter(&batch, predicate)?
-                } else {
-                    batch
-                };
-                let batch = batch.project(&projection).unwrap();
-                let batch = mapper.map_batch(batch).unwrap();
-                new_batches.push(batch);
-            }
-            batches = new_batches;
+        let mut new_batches = Vec::new();
+        for batch in batches {
+            let batch = if let Some(predicate) = &self.predicate {
+                batch_filter(&batch, predicate)?
+            } else {
+                batch
+            };
+            new_batches.push(batch);
         }
+        batches = new_batches;
+        if let Some(projection) = &self.projection {
+            let projector = projection.make_projector(&schema)?;
             batches = batches
                 .into_iter()
-                .map(|batch| batch.project(projection).unwrap())
+                .map(|batch| projector.project_batch(&batch).unwrap())
                 .collect();
         }
@@ -102,26 +99,28 @@ impl FileOpener for TestOpener {
 }

 /// A placeholder data source that accepts filter pushdown
-#[derive(Clone, Default)]
+#[derive(Clone)]
 pub struct TestSource {
     support: bool,
     predicate: Option<Arc<dyn PhysicalExpr>>,
-    statistics: Option<Statistics>,
     batch_size: Option<usize>,
     batches: Vec<RecordBatch>,
-    schema: Option<SchemaRef>,
     metrics: ExecutionPlanMetricsSet,
-    projection: Option<Vec<usize>>,
-    schema_adapter_factory: Option<Arc<dyn SchemaAdapterFactory>>,
+    projection: Option<ProjectionExprs>,
+    table_schema: datafusion_datasource::TableSchema,
 }

 impl TestSource {
-    pub fn new(support: bool, batches: Vec<RecordBatch>) -> Self {
+    pub fn new(schema: SchemaRef, support: bool, batches: Vec<RecordBatch>) -> Self {
+        let table_schema = datafusion_datasource::TableSchema::new(schema, vec![]);
         Self {
             support,
             metrics: ExecutionPlanMetricsSet::new(),
             batches,
-            ..Default::default()
+            predicate: None,
+            batch_size: None,
+            projection: None,
+            table_schema,
         }
     }
 }
@@ -132,24 +131,19 @@ impl FileSource for TestSource {
         _object_store: Arc<dyn ObjectStore>,
         _base_config: &FileScanConfig,
         _partition: usize,
-    ) -> Arc<dyn FileOpener> {
-        Arc::new(TestOpener {
+    ) -> Result<Arc<dyn FileOpener>> {
+        Ok(Arc::new(TestOpener {
             batches: self.batches.clone(),
             batch_size: self.batch_size,
-            schema: self.schema.clone(),
             projection: self.projection.clone(),
             predicate: self.predicate.clone(),
-        })
+        }))
     }

     fn filter(&self) -> Option<Arc<dyn PhysicalExpr>> {
         self.predicate.clone()
     }

-    fn as_any(&self) -> &dyn Any {
-        todo!("should not be called")
-    }
-
     fn with_batch_size(&self, batch_size: usize) -> Arc<dyn FileSource> {
         Arc::new(TestSource {
             batch_size: Some(batch_size),
@@ -157,43 +151,10 @@ impl FileSource for TestSource {
         })
     }

-    fn with_schema(&self, schema: TableSchema) -> Arc<dyn FileSource> {
-        assert!(
-            schema.table_partition_cols().is_empty(),
-            "TestSource does not support partition columns"
-        );
-        Arc::new(TestSource {
-            schema: Some(schema.file_schema().clone()),
-            ..self.clone()
-        })
-    }
-
-    fn with_projection(&self, config: &FileScanConfig) -> Arc<dyn FileSource> {
-        Arc::new(TestSource {
-            projection: config.projection_exprs.as_ref().map(|p| p.column_indices()),
-            ..self.clone()
-        })
-    }
-
-    fn with_statistics(&self, statistics: Statistics) -> Arc<dyn FileSource> {
-        Arc::new(TestSource {
-            statistics: Some(statistics),
-            ..self.clone()
-        })
-    }
-
     fn metrics(&self) -> &ExecutionPlanMetricsSet {
         &self.metrics
     }

-    fn statistics(&self) -> Result<Statistics> {
-        Ok(self
-            .statistics
-            .as_ref()
-            .expect("statistics not set")
-            .clone())
-    }
-
     fn file_type(&self) -> &str {
         "test"
     }
@@ -247,18 +208,51 @@ impl FileSource for TestSource {
         }
     }

-    fn with_schema_adapter_factory(
+    fn try_pushdown_projection(
         &self,
-        schema_adapter_factory: Arc<dyn SchemaAdapterFactory>,
-    ) -> Result<Arc<dyn FileSource>> {
-        Ok(Arc::new(Self {
-            schema_adapter_factory: Some(schema_adapter_factory),
-            ..self.clone()
-        }))
+        projection: &ProjectionExprs,
+    ) -> Result<Option<Arc<dyn FileSource>>> {
+        if let Some(existing_projection) = &self.projection {
+            // Combine existing projection with new projection
+            let combined_projection = existing_projection.try_merge(projection)?;
+            Ok(Some(Arc::new(TestSource {
+                projection: Some(combined_projection),
+                table_schema: self.table_schema.clone(),
+                ..self.clone()
+            })))
+        } else {
+            Ok(Some(Arc::new(TestSource {
+                projection: Some(projection.clone()),
+                ..self.clone()
+            })))
+        }
     }
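    // Worked example of the `try_merge` above (hypothetical projections, used
    // only to illustrate the intent; these columns are not part of the tests):
    // if this source already projects `[a@0 + 1 AS x, b@1 AS b]` and the
    // optimizer later pushes down `[x@0 * 2 AS y]`, the merged projection is
    // expected to rewrite the new expressions against the file schema, yielding
    // a single combined projection `[(a@0 + 1) * 2 AS y]` that the opener
    // applies once per batch.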
-    fn schema_adapter_factory(&self) -> Option<Arc<dyn SchemaAdapterFactory>> {
-        self.schema_adapter_factory.clone()
+    fn projection(&self) -> Option<&ProjectionExprs> {
+        self.projection.as_ref()
+    }
+
+    fn table_schema(&self) -> &datafusion_datasource::TableSchema {
+        &self.table_schema
+    }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit predicate (filter) expression if present
+        if let Some(predicate) = &self.predicate {
+            f(predicate.as_ref())?;
+        }
+
+        // Visit projection expressions if present
+        if let Some(projection) = &self.projection {
+            for proj_expr in projection {
+                f(proj_expr.expr.as_ref())?;
+            }
+        }
+
+        Ok(TreeNodeRecursion::Continue)
     }
 }
@@ -289,14 +283,15 @@ impl TestScanBuilder {
     }

     pub fn build(self) -> Arc<dyn ExecutionPlan> {
-        let source = Arc::new(TestSource::new(self.support, self.batches));
-        let base_config = FileScanConfigBuilder::new(
-            ObjectStoreUrl::parse("test://").unwrap(),
+        let source = Arc::new(TestSource::new(
             Arc::clone(&self.schema),
-            source,
-        )
-        .with_file(PartitionedFile::new("test.parquet", 123))
-        .build();
+            self.support,
+            self.batches,
+        ));
+        let base_config =
+            FileScanConfigBuilder::new(ObjectStoreUrl::parse("test://").unwrap(), source)
+                .with_file(PartitionedFile::new("test.parquet", 123))
+                .build();
         DataSourceExec::from_data_source(base_config)
     }
 }
@@ -335,11 +330,12 @@ impl TestStream {
     /// least one entry in data (for the schema)
     pub fn new(data: Vec<RecordBatch>) -> Self {
         // check that there is at least one entry in data and that all batches have the same schema
-        assert!(!data.is_empty(), "data must not be empty");
-        assert!(
-            data.iter().all(|batch| batch.schema() == data[0].schema()),
-            "all batches must have the same schema"
-        );
+        if let Some(first) = data.first() {
+            assert!(
+                data.iter().all(|batch| batch.schema() == first.schema()),
+                "all batches must have the same schema"
+            );
+        }
         Self {
             data,
             ..Default::default()
@@ -377,6 +373,7 @@ pub struct OptimizationTest {
 }

 impl OptimizationTest {
+    #[expect(clippy::needless_pass_by_value)]
     pub fn new<O>(
         input_plan: Arc<dyn ExecutionPlan>,
         opt: O,
@@ -488,11 +485,7 @@ impl ExecutionPlan for TestNode {
         "TestInsertExec"
     }

-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         self.input.properties()
     }
@@ -576,4 +569,13 @@ impl ExecutionPlan for TestNode {
         Ok(res)
     }
 }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit the predicate expression
+        f(self.predicate.as_ref())?;
+        Ok(TreeNodeRecursion::Continue)
+    }
 }
diff --git a/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs b/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs
index 066e52614a12e..601667ea02c0d 100644
--- a/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs
+++ b/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs
@@ -18,10 +18,10 @@
 use std::sync::Arc;

 use crate::physical_optimizer::test_utils::{
-    check_integrity, coalesce_batches_exec, coalesce_partitions_exec,
-    create_test_schema3, parquet_exec_with_sort, sort_exec,
-    sort_exec_with_preserve_partitioning, sort_preserving_merge_exec,
-    sort_preserving_merge_exec_with_fetch, stream_exec_ordered_with_projection,
+    check_integrity, coalesce_partitions_exec, create_test_schema3,
+    parquet_exec_with_sort, sort_exec, sort_exec_with_preserve_partitioning,
+    sort_preserving_merge_exec, sort_preserving_merge_exec_with_fetch,
+    stream_exec_ordered_with_projection,
 };

 use datafusion::prelude::SessionContext;
@@ -41,7 +41,6 @@ use
datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; use datafusion_physical_optimizer::enforce_sorting::replace_with_order_preserving_variants::{ plan_with_order_breaking_variants, plan_with_order_preserving_variants, replace_with_order_preserving_variants, OrderPreservationContext }; -use datafusion_physical_plan::coalesce_batches::CoalesceBatchesExec; use datafusion_physical_plan::filter::FilterExec; use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode}; use datafusion::datasource::memory::MemorySourceConfig; @@ -50,8 +49,8 @@ use datafusion_physical_plan::{ collect, displayable, ExecutionPlan, Partitioning, }; +use object_store::ObjectStoreExt; use object_store::memory::InMemory; -use object_store::ObjectStore; use rstest::rstest; use url::Url; @@ -138,7 +137,8 @@ impl ReplaceTest { assert!( res.is_ok(), "Some errors occurred while executing the optimized physical plan: {:?}\nPlan: {}", - res.unwrap_err(), optimized_plan_string + res.unwrap_err(), + optimized_plan_string ); } @@ -192,7 +192,7 @@ async fn test_replace_multiple_input_repartition_1( SortPreservingMergeExec: [a@0 ASC NULLS LAST] SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST "); }, @@ -202,13 +202,13 @@ async fn test_replace_multiple_input_repartition_1( SortPreservingMergeExec: [a@0 ASC NULLS LAST] SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] Optimized: SortPreservingMergeExec: [a@0 ASC NULLS LAST] RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] "); }, @@ -218,13 +218,13 @@ async fn test_replace_multiple_input_repartition_1( SortPreservingMergeExec: [a@0 ASC NULLS LAST] SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST Optimized: SortPreservingMergeExec: [a@0 ASC NULLS LAST] RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST "); } @@ -275,21 +275,21 @@ async fn 
test_with_inter_children_change_only( SortExec: expr=[a@0 ASC], preserve_partitioning=[true] FilterExec: c@1 > 3 RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true SortExec: expr=[a@0 ASC], preserve_partitioning=[false] CoalescePartitionsExec RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC] Optimized: SortPreservingMergeExec: [a@0 ASC] FilterExec: c@1 > 3 RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true SortPreservingMergeExec: [a@0 ASC] RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC] "); }, @@ -300,11 +300,11 @@ async fn test_with_inter_children_change_only( SortExec: expr=[a@0 ASC], preserve_partitioning=[true] FilterExec: c@1 > 3 RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true SortExec: expr=[a@0 ASC], preserve_partitioning=[false] CoalescePartitionsExec RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC "); }, @@ -315,21 +315,21 @@ async fn test_with_inter_children_change_only( SortExec: expr=[a@0 ASC], preserve_partitioning=[true] FilterExec: c@1 > 3 RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true SortExec: expr=[a@0 ASC], preserve_partitioning=[false] CoalescePartitionsExec RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC Optimized: SortPreservingMergeExec: [a@0 ASC] FilterExec: c@1 > 3 RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true SortPreservingMergeExec: [a@0 ASC] RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 
ASC - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC "); } @@ -375,14 +375,14 @@ async fn test_replace_multiple_input_repartition_2( SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 FilterExec: c@1 > 3 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] Optimized: SortPreservingMergeExec: [a@0 ASC NULLS LAST] RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST FilterExec: c@1 > 3 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] "); }, @@ -393,7 +393,7 @@ async fn test_replace_multiple_input_repartition_2( SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 FilterExec: c@1 > 3 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST "); }, @@ -404,14 +404,14 @@ async fn test_replace_multiple_input_repartition_2( SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 FilterExec: c@1 > 3 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST Optimized: SortPreservingMergeExec: [a@0 ASC NULLS LAST] RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST FilterExec: c@1 > 3 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST "); } @@ -439,9 +439,7 @@ async fn test_replace_multiple_input_repartition_with_extra_steps( let repartition_rr = repartition_exec_round_robin(source); let repartition_hash = repartition_exec_hash(repartition_rr); let filter = filter_exec(repartition_hash); - let coalesce_batches_exec = coalesce_batches_exec(filter, 8192); - let sort = - sort_exec_with_preserve_partitioning(ordering.clone(), coalesce_batches_exec); + let sort = sort_exec_with_preserve_partitioning(ordering.clone(), filter); let physical_plan = sort_preserving_merge_exec(ordering, sort); let run = ReplaceTest::new(physical_plan) @@ -457,19 +455,17 @@ async fn test_replace_multiple_input_repartition_with_extra_steps( Input: SortPreservingMergeExec: [a@0 ASC NULLS LAST] SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c@1 > 3 - 
RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] + FilterExec: c@1 > 3 + RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true + StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] Optimized: SortPreservingMergeExec: [a@0 ASC NULLS LAST] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c@1 > 3 - RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] + FilterExec: c@1 > 3 + RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true + StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] "); }, (Boundedness::Bounded, SortPreference::MaximizeParallelism) => { @@ -477,11 +473,10 @@ async fn test_replace_multiple_input_repartition_with_extra_steps( Input / Optimized: SortPreservingMergeExec: [a@0 ASC NULLS LAST] SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c@1 > 3 - RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST + FilterExec: c@1 > 3 + RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true + DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST "); }, (Boundedness::Bounded, SortPreference::PreserveOrder) => { @@ -489,19 +484,17 @@ async fn test_replace_multiple_input_repartition_with_extra_steps( Input: SortPreservingMergeExec: [a@0 ASC NULLS LAST] SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c@1 > 3 - RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST + FilterExec: c@1 > 3 + RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true + DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST Optimized: SortPreservingMergeExec: [a@0 ASC NULLS LAST] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c@1 > 3 - RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST + FilterExec: c@1 > 3 + RepartitionExec: partitioning=Hash([c@1], 8), 
input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true + DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST "); } } @@ -526,12 +519,9 @@ async fn test_replace_multiple_input_repartition_with_extra_steps_2( Boundedness::Bounded => memory_exec_sorted(&schema, ordering.clone()), }; let repartition_rr = repartition_exec_round_robin(source); - let coalesce_batches_exec_1 = coalesce_batches_exec(repartition_rr, 8192); - let repartition_hash = repartition_exec_hash(coalesce_batches_exec_1); + let repartition_hash = repartition_exec_hash(repartition_rr); let filter = filter_exec(repartition_hash); - let coalesce_batches_exec_2 = coalesce_batches_exec(filter, 8192); - let sort = - sort_exec_with_preserve_partitioning(ordering.clone(), coalesce_batches_exec_2); + let sort = sort_exec_with_preserve_partitioning(ordering.clone(), filter); let physical_plan = sort_preserving_merge_exec(ordering, sort); let run = ReplaceTest::new(physical_plan) @@ -547,21 +537,17 @@ async fn test_replace_multiple_input_repartition_with_extra_steps_2( Input: SortPreservingMergeExec: [a@0 ASC NULLS LAST] SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c@1 > 3 - RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] + FilterExec: c@1 > 3 + RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true + StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] Optimized: SortPreservingMergeExec: [a@0 ASC NULLS LAST] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c@1 > 3 - RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] + FilterExec: c@1 > 3 + RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true + StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] "); }, (Boundedness::Bounded, SortPreference::MaximizeParallelism) => { @@ -569,12 +555,10 @@ async fn test_replace_multiple_input_repartition_with_extra_steps_2( Input / Optimized: SortPreservingMergeExec: [a@0 ASC NULLS LAST] SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c@1 > 3 - RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST + FilterExec: c@1 > 3 + RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 
+ RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true + DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST "); }, (Boundedness::Bounded, SortPreference::PreserveOrder) => { @@ -582,21 +566,17 @@ async fn test_replace_multiple_input_repartition_with_extra_steps_2( Input: SortPreservingMergeExec: [a@0 ASC NULLS LAST] SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c@1 > 3 - RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST + FilterExec: c@1 > 3 + RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true + DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST Optimized: SortPreservingMergeExec: [a@0 ASC NULLS LAST] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c@1 > 3 - RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST + FilterExec: c@1 > 3 + RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true + DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST "); } } @@ -621,8 +601,7 @@ async fn test_not_replacing_when_no_need_to_preserve_sorting( let repartition_rr = repartition_exec_round_robin(source); let repartition_hash = repartition_exec_hash(repartition_rr); let filter = filter_exec(repartition_hash); - let coalesce_batches_exec = coalesce_batches_exec(filter, 8192); - let physical_plan = coalesce_partitions_exec(coalesce_batches_exec); + let physical_plan = coalesce_partitions_exec(filter); let run = ReplaceTest::new(physical_plan) .with_boundedness(boundedness) @@ -636,22 +615,20 @@ async fn test_not_replacing_when_no_need_to_preserve_sorting( assert_snapshot!(physical_plan, @r" Input / Optimized: CoalescePartitionsExec - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c@1 > 3 - RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] + FilterExec: c@1 > 3 + RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true + StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] "); }, (Boundedness::Bounded, SortPreference::MaximizeParallelism) => { assert_snapshot!(physical_plan, @r" Input / Optimized: CoalescePartitionsExec - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c@1 > 3 - RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - DataSourceExec: 
partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST + FilterExec: c@1 > 3 + RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true + DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST "); // Expected bounded results same with and without flag, because there is no executor with ordering requirement }, @@ -659,11 +636,10 @@ async fn test_not_replacing_when_no_need_to_preserve_sorting( assert_snapshot!(physical_plan, @r" Input / Optimized: CoalescePartitionsExec - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c@1 > 3 - RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST + FilterExec: c@1 > 3 + RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true + DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST "); } } @@ -690,8 +666,7 @@ async fn test_with_multiple_replaceable_repartitions( let repartition_rr = repartition_exec_round_robin(source); let repartition_hash = repartition_exec_hash(repartition_rr); let filter = filter_exec(repartition_hash); - let coalesce_batches = coalesce_batches_exec(filter, 8192); - let repartition_hash_2 = repartition_exec_hash(coalesce_batches); + let repartition_hash_2 = repartition_exec_hash(filter); let sort = sort_exec_with_preserve_partitioning(ordering.clone(), repartition_hash_2); let physical_plan = sort_preserving_merge_exec(ordering, sort); @@ -709,20 +684,18 @@ async fn test_with_multiple_replaceable_repartitions( SortPreservingMergeExec: [a@0 ASC NULLS LAST] SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c@1 > 3 - RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] + FilterExec: c@1 > 3 + RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true + StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] Optimized: SortPreservingMergeExec: [a@0 ASC NULLS LAST] RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c@1 > 3 - RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] + FilterExec: c@1 > 3 + RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true + StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, 
output_ordering=[a@0 ASC NULLS LAST] "); }, (Boundedness::Bounded, SortPreference::MaximizeParallelism) => { @@ -731,11 +704,10 @@ async fn test_with_multiple_replaceable_repartitions( SortPreservingMergeExec: [a@0 ASC NULLS LAST] SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c@1 > 3 - RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST + FilterExec: c@1 > 3 + RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true + DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST "); }, (Boundedness::Bounded, SortPreference::PreserveOrder) => { @@ -744,20 +716,18 @@ async fn test_with_multiple_replaceable_repartitions( SortPreservingMergeExec: [a@0 ASC NULLS LAST] SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c@1 > 3 - RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST + FilterExec: c@1 > 3 + RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true + DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST Optimized: SortPreservingMergeExec: [a@0 ASC NULLS LAST] RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c@1 > 3 - RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST + FilterExec: c@1 > 3 + RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true + DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST "); } } @@ -804,7 +774,7 @@ async fn test_not_replace_with_different_orderings( SortPreservingMergeExec: [c@1 ASC] SortExec: expr=[c@1 ASC], preserve_partitioning=[true] RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] "); }, @@ -814,7 +784,7 @@ async fn test_not_replace_with_different_orderings( SortPreservingMergeExec: [c@1 ASC] SortExec: expr=[c@1 ASC], preserve_partitioning=[true] RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), 
input_partitions=1, maintains_sort_order=true DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST "); // Expected bounded results same with and without flag, because ordering requirement of the executor is @@ -826,7 +796,7 @@ async fn test_not_replace_with_different_orderings( SortPreservingMergeExec: [c@1 ASC] SortExec: expr=[c@1 ASC], preserve_partitioning=[true] RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST "); } @@ -870,13 +840,13 @@ async fn test_with_lost_ordering( SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false] CoalescePartitionsExec RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] Optimized: SortPreservingMergeExec: [a@0 ASC NULLS LAST] RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] "); }, @@ -886,7 +856,7 @@ async fn test_with_lost_ordering( SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false] CoalescePartitionsExec RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST "); }, @@ -896,13 +866,13 @@ async fn test_with_lost_ordering( SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false] CoalescePartitionsExec RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST Optimized: SortPreservingMergeExec: [a@0 ASC NULLS LAST] RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST "); } @@ -956,22 +926,22 @@ async fn test_with_lost_and_kept_ordering( SortExec: expr=[c@1 ASC], preserve_partitioning=[true] FilterExec: c@1 > 3 RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true SortExec: expr=[c@1 ASC], preserve_partitioning=[false] CoalescePartitionsExec RepartitionExec: 
partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] Optimized: SortPreservingMergeExec: [c@1 ASC] FilterExec: c@1 > 3 RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=c@1 ASC - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true SortExec: expr=[c@1 ASC], preserve_partitioning=[false] CoalescePartitionsExec RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] "); }, @@ -982,11 +952,11 @@ async fn test_with_lost_and_kept_ordering( SortExec: expr=[c@1 ASC], preserve_partitioning=[true] FilterExec: c@1 > 3 RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true SortExec: expr=[c@1 ASC], preserve_partitioning=[false] CoalescePartitionsExec RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST "); }, @@ -997,22 +967,22 @@ async fn test_with_lost_and_kept_ordering( SortExec: expr=[c@1 ASC], preserve_partitioning=[true] FilterExec: c@1 > 3 RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true SortExec: expr=[c@1 ASC], preserve_partitioning=[false] CoalescePartitionsExec RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST Optimized: SortPreservingMergeExec: [c@1 ASC] FilterExec: c@1 > 3 RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=c@1 ASC - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true SortExec: expr=[c@1 ASC], preserve_partitioning=[false] CoalescePartitionsExec RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST "); } @@ -1040,8 +1010,6 @@ async fn test_with_multiple_child_trees( }; let left_repartition_rr = repartition_exec_round_robin(left_source); 
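    // The two `repartition_exec_hash` calls below build plain RepartitionExec
    // nodes; the rule under test decides whether to swap them for their
    // order-preserving variants. For reference, a sketch of constructing the
    // order-preserving variant directly (assuming RepartitionExec's builder
    // API; illustrative only, not executed by this test):
    //
    //     let repartition = RepartitionExec::try_new(
    //         input,
    //         Partitioning::Hash(vec![col("c", &schema)?], 8),
    //     )?
    //     .with_preserve_order();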
let left_repartition_hash = repartition_exec_hash(left_repartition_rr); - let left_coalesce_partitions = - Arc::new(CoalesceBatchesExec::new(left_repartition_hash, 4096)); let right_ordering = [sort_expr("a", &schema)].into(); let right_source = match boundedness { @@ -1052,11 +1020,8 @@ async fn test_with_multiple_child_trees( }; let right_repartition_rr = repartition_exec_round_robin(right_source); let right_repartition_hash = repartition_exec_hash(right_repartition_rr); - let right_coalesce_partitions = - Arc::new(CoalesceBatchesExec::new(right_repartition_hash, 4096)); - let hash_join_exec = - hash_join_exec(left_coalesce_partitions, right_coalesce_partitions); + let hash_join_exec = hash_join_exec(left_repartition_hash, right_repartition_hash); let ordering: LexOrdering = [sort_expr_default("a", &hash_join_exec.schema())].into(); let sort = sort_exec_with_preserve_partitioning(ordering.clone(), hash_join_exec); let physical_plan = sort_preserving_merge_exec(ordering, sort); @@ -1075,14 +1040,12 @@ async fn test_with_multiple_child_trees( SortPreservingMergeExec: [a@0 ASC] SortExec: expr=[a@0 ASC], preserve_partitioning=[true] HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@1, c@1)] - CoalesceBatchesExec: target_batch_size=4096 - RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] - CoalesceBatchesExec: target_batch_size=4096 - RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] + RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true + StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] + RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true + StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST] "); }, (Boundedness::Bounded, _) => { @@ -1091,14 +1054,12 @@ async fn test_with_multiple_child_trees( SortPreservingMergeExec: [a@0 ASC] SortExec: expr=[a@0 ASC], preserve_partitioning=[true] HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@1, c@1)] - CoalesceBatchesExec: target_batch_size=4096 - RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST - CoalesceBatchesExec: target_batch_size=4096 - RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 - RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 - DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST + RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true + DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST + RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8 + RepartitionExec: 
partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+              DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
             ");
             // Expected bounded results are the same with and without the flag, because the
             // ordering gets lost in an intermediate executor anyway.
             // Hence, no need to preserve existing ordering.
@@ -1166,8 +1127,8 @@ fn hash_join_exec(
 ) -> Arc<dyn ExecutionPlan> {
     let left_on = col("c", &left.schema()).unwrap();
     let right_on = col("c", &right.schema()).unwrap();
-    let left_col = left_on.as_any().downcast_ref::<Column>().unwrap();
-    let right_col = right_on.as_any().downcast_ref::<Column>().unwrap();
+    let left_col = left_on.downcast_ref::<Column>().unwrap();
+    let right_col = right_on.downcast_ref::<Column>().unwrap();
     Arc::new(
         HashJoinExec::try_new(
             left,
@@ -1178,6 +1139,7 @@
             None,
             PartitionMode::Partitioned,
             NullEquality::NullEqualsNothing,
+            false,
         )
         .unwrap(),
     )
@@ -1248,7 +1210,10 @@ fn test_plan_with_order_preserving_variants_preserves_fetch() -> Result<()> {
         )],
     );
     let res = plan_with_order_preserving_variants(requirements, false, true, Some(15));
-    assert_contains!(res.unwrap_err().to_string(), "CoalescePartitionsExec fetch [10] should be greater than or equal to SortExec fetch [15]");
+    assert_contains!(
+        res.unwrap_err().to_string(),
+        "CoalescePartitionsExec fetch [10] should be greater than or equal to SortExec fetch [15]"
+    );

     // Test sort is without fetch, expected to get the fetch value from the coalesced
     let requirements = OrderPreservationContext::new(
diff --git a/datafusion/core/tests/physical_optimizer/sanity_checker.rs b/datafusion/core/tests/physical_optimizer/sanity_checker.rs
index 9867ed1733413..217570846d56e 100644
--- a/datafusion/core/tests/physical_optimizer/sanity_checker.rs
+++ b/datafusion/core/tests/physical_optimizer/sanity_checker.rs
@@ -30,13 +30,13 @@ use datafusion::datasource::stream::{FileStreamProvider, StreamConfig, StreamTab
 use datafusion::prelude::{CsvReadOptions, SessionContext};
 use datafusion_common::config::ConfigOptions;
 use datafusion_common::{JoinType, Result, ScalarValue};
-use datafusion_physical_expr::expressions::{col, Literal};
 use datafusion_physical_expr::Partitioning;
+use datafusion_physical_expr::expressions::{Literal, col};
 use datafusion_physical_expr_common::sort_expr::LexOrdering;
-use datafusion_physical_optimizer::sanity_checker::SanityCheckPlan;
 use datafusion_physical_optimizer::PhysicalOptimizerRule;
+use datafusion_physical_optimizer::sanity_checker::SanityCheckPlan;
 use datafusion_physical_plan::repartition::RepartitionExec;
-use datafusion_physical_plan::{displayable, ExecutionPlan};
+use datafusion_physical_plan::{ExecutionPlan, displayable};

 use async_trait::async_trait;

@@ -555,11 +555,11 @@ async fn test_sort_merge_join_satisfied() -> Result<()> {
     assert_snapshot!(
         actual,
         @r"
-    SortMergeJoin: join_type=Inner, on=[(c9@0, a@0)]
-      RepartitionExec: partitioning=Hash([c9@0], 10), input_partitions=1
+    SortMergeJoinExec: join_type=Inner, on=[(c9@0, a@0)]
+      RepartitionExec: partitioning=Hash([c9@0], 10), input_partitions=1, maintains_sort_order=true
         SortExec: expr=[c9@0 ASC], preserve_partitioning=[false]
           DataSourceExec: partitions=1, partition_sizes=[0]
-      RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1
+      RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1, maintains_sort_order=true
         SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
           DataSourceExec: partitions=1, partition_sizes=[0]
     "
@@ -605,8 +605,8 @@ async fn test_sort_merge_join_order_missing() -> Result<()> {
assert_snapshot!( actual, @r" - SortMergeJoin: join_type=Inner, on=[(c9@0, a@0)] - RepartitionExec: partitioning=Hash([c9@0], 10), input_partitions=1 + SortMergeJoinExec: join_type=Inner, on=[(c9@0, a@0)] + RepartitionExec: partitioning=Hash([c9@0], 10), input_partitions=1, maintains_sort_order=true SortExec: expr=[c9@0 ASC], preserve_partitioning=[false] DataSourceExec: partitions=1, partition_sizes=[0] RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1 @@ -653,11 +653,11 @@ async fn test_sort_merge_join_dist_missing() -> Result<()> { assert_snapshot!( actual, @r" - SortMergeJoin: join_type=Inner, on=[(c9@0, a@0)] - RepartitionExec: partitioning=Hash([c9@0], 10), input_partitions=1 + SortMergeJoinExec: join_type=Inner, on=[(c9@0, a@0)] + RepartitionExec: partitioning=Hash([c9@0], 10), input_partitions=1, maintains_sort_order=true SortExec: expr=[c9@0 ASC], preserve_partitioning=[false] DataSourceExec: partitions=1, partition_sizes=[0] - RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true SortExec: expr=[a@0 ASC], preserve_partitioning=[false] DataSourceExec: partitions=1, partition_sizes=[0] " diff --git a/datafusion/core/tests/physical_optimizer/test_utils.rs b/datafusion/core/tests/physical_optimizer/test_utils.rs index 8ca33f3d4abb9..6814ab2358ffc 100644 --- a/datafusion/core/tests/physical_optimizer/test_utils.rs +++ b/datafusion/core/tests/physical_optimizer/test_utils.rs @@ -17,8 +17,7 @@ //! Test utilities for physical optimizer tests -use std::any::Any; -use std::fmt::Formatter; +use std::fmt::{Display, Formatter}; use std::sync::{Arc, LazyLock}; use arrow::array::Int32Array; @@ -31,27 +30,32 @@ use datafusion::datasource::physical_plan::ParquetSource; use datafusion::datasource::source::DataSourceExec; use datafusion_common::config::ConfigOptions; use datafusion_common::stats::Precision; -use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; +use datafusion_common::tree_node::{ + Transformed, TransformedResult, TreeNode, TreeNodeRecursion, +}; use datafusion_common::utils::expr::COUNT_STAR_EXPANSION; -use datafusion_common::{ColumnStatistics, JoinType, NullEquality, Result, Statistics}; +use datafusion_common::{ + ColumnStatistics, JoinType, NullEquality, Result, Statistics, internal_err, +}; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_expr::{WindowFrame, WindowFunctionDefinition}; use datafusion_functions_aggregate::count::count_udaf; +use datafusion_physical_expr::EquivalenceProperties; use datafusion_physical_expr::aggregate::{AggregateExprBuilder, AggregateFunctionExpr}; use datafusion_physical_expr::expressions::{self, col}; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::{ LexOrdering, OrderingRequirements, PhysicalSortExpr, }; -use datafusion_physical_optimizer::limited_distinct_aggregation::LimitedDistinctAggregation; use datafusion_physical_optimizer::PhysicalOptimizerRule; +use datafusion_physical_optimizer::limited_distinct_aggregation::LimitedDistinctAggregation; use datafusion_physical_plan::aggregates::{ AggregateExec, AggregateMode, PhysicalGroupBy, }; -use datafusion_physical_plan::coalesce_batches::CoalesceBatchesExec; use 
datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;
+use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType};
 use datafusion_physical_plan::filter::FilterExec;
 use datafusion_physical_plan::joins::utils::{JoinFilter, JoinOn};
 use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode, SortMergeJoinExec};
@@ -63,18 +67,17 @@ use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeE
 use datafusion_physical_plan::streaming::{PartitionStream, StreamingTableExec};
 use datafusion_physical_plan::tree_node::PlanContext;
 use datafusion_physical_plan::union::UnionExec;
-use datafusion_physical_plan::windows::{create_window_expr, BoundedWindowAggExec};
+use datafusion_physical_plan::windows::{BoundedWindowAggExec, create_window_expr};
 use datafusion_physical_plan::{
-    displayable, DisplayAs, DisplayFormatType, ExecutionPlan, InputOrderMode,
-    Partitioning, PlanProperties,
+    DisplayAs, DisplayFormatType, ExecutionPlan, InputOrderMode, Partitioning,
+    PlanProperties, SortOrderPushdownResult, displayable,
 };

 /// Create a non sorted parquet exec
 pub fn parquet_exec(schema: SchemaRef) -> Arc<DataSourceExec> {
     let config = FileScanConfigBuilder::new(
         ObjectStoreUrl::parse("test:///").unwrap(),
-        schema,
-        Arc::new(ParquetSource::default()),
+        Arc::new(ParquetSource::new(schema)),
     )
     .with_file(PartitionedFile::new("x".to_string(), 100))
     .build();
@@ -89,8 +92,7 @@ pub(crate) fn parquet_exec_with_sort(
 ) -> Arc<DataSourceExec> {
     let config = FileScanConfigBuilder::new(
         ObjectStoreUrl::parse("test:///").unwrap(),
-        schema,
-        Arc::new(ParquetSource::default()),
+        Arc::new(ParquetSource::new(schema)),
     )
     .with_file(PartitionedFile::new("x".to_string(), 100))
     .with_output_ordering(output_ordering)
@@ -106,6 +108,7 @@ fn int64_stats() -> ColumnStatistics {
     max_value: Precision::Exact(1_000_000.into()),
     min_value: Precision::Exact(0.into()),
     distinct_count: Precision::Absent,
+    byte_size: Precision::Absent,
     }
 }
@@ -127,17 +130,13 @@ pub(crate) fn parquet_exec_with_stats(file_size: u64) -> Arc<DataSourceExec> {
     let config = FileScanConfigBuilder::new(
         ObjectStoreUrl::parse("test:///").unwrap(),
-        schema(),
-        Arc::new(ParquetSource::new(Default::default())),
+        Arc::new(ParquetSource::new(schema())),
     )
     .with_file(PartitionedFile::new("x".to_string(), file_size))
     .with_statistics(statistics)
     .build();

-    assert_eq!(
-        config.file_source.statistics().unwrap().num_rows,
-        Precision::Inexact(10000)
-    );
+    assert_eq!(config.statistics().num_rows, Precision::Inexact(10000));

     DataSourceExec::from_data_source(config)
 }

@@ -249,6 +248,7 @@ pub fn hash_join_exec(
         None,
         PartitionMode::Partitioned,
         NullEquality::NullEqualsNothing,
+        false,
     )?))
 }

@@ -361,13 +361,6 @@ pub fn aggregate_exec(input: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
     )
 }

-pub fn coalesce_batches_exec(
-    input: Arc<dyn ExecutionPlan>,
-    batch_size: usize,
-) -> Arc<dyn ExecutionPlan> {
-    Arc::new(CoalesceBatchesExec::new(input, batch_size))
-}
-
 pub fn sort_exec(
     ordering: LexOrdering,
     input: Arc<dyn ExecutionPlan>,
@@ -458,19 +451,16 @@ impl ExecutionPlan for RequirementsTestExec {
         "RequiredInputOrderingExec"
     }

-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         self.input.properties()
     }

     fn required_input_ordering(&self) -> Vec<Option<OrderingRequirements>> {
-        vec![self
-            .required_input_ordering
-            .as_ref()
-            .map(|ordering| OrderingRequirements::from(ordering.clone()))]
+        vec![
+            self.required_input_ordering
+                .as_ref()
+                .map(|ordering| OrderingRequirements::from(ordering.clone())),
+        ]
     }

     fn maintains_input_order(&self) -> Vec<bool> {
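(The hunk below adds `apply_expressions` to RequirementsTestExec. This is the
expression-visitor hook threaded through the test nodes in this PR: the callback
`f` is applied to every `PhysicalExpr` the node embeds, and the returned
`TreeNodeRecursion` lets a visitor short-circuit. A minimal sketch for a node
holding a single optional expression — `self.expr` here is hypothetical —
mirroring the implementations added in this diff:)

    fn apply_expressions(
        &self,
        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
    ) -> Result<TreeNodeRecursion> {
        let mut tnr = TreeNodeRecursion::Continue;
        if let Some(expr) = &self.expr {
            // `visit_sibling` keeps visiting subsequent expressions unless the
            // callback asks to stop
            tnr = tnr.visit_sibling(|| f(expr.as_ref()))?;
        }
        Ok(tnr)
    }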
@@ -499,6 +489,20 @@ impl ExecutionPlan for RequirementsTestExec {
     ) -> Result<SendableRecordBatchStream> {
         unimplemented!("Test exec does not support execution")
     }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in required_input_ordering if present
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = &self.required_input_ordering {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
+    }
 }

 /// A [`PlanContext`] object is susceptible to being left in an inconsistent state after
@@ -704,3 +708,361 @@ impl TestAggregate {
         }
     }
 }
+
+/// A harness for testing physical optimizers.
+#[derive(Debug)]
+pub struct OptimizationTest {
+    input: Vec<String>,
+    output: Result<Vec<String>, String>,
+}
+
+impl OptimizationTest {
+    pub fn new<O>(
+        input_plan: Arc<dyn ExecutionPlan>,
+        opt: O,
+        enable_sort_pushdown: bool,
+    ) -> Self
+    where
+        O: PhysicalOptimizerRule,
+    {
+        let input = format_execution_plan(&input_plan);
+        let input_schema = input_plan.schema();
+
+        let mut config = ConfigOptions::new();
+        config.optimizer.enable_sort_pushdown = enable_sort_pushdown;
+        let output_result = opt.optimize(input_plan, &config);
+        let output = output_result
+            .and_then(|plan| {
+                if opt.schema_check() && (plan.schema() != input_schema) {
+                    internal_err!(
+                        "Schema mismatch:\n\nBefore:\n{:?}\n\nAfter:\n{:?}",
+                        input_schema,
+                        plan.schema()
+                    )
+                } else {
+                    Ok(plan)
+                }
+            })
+            .map(|plan| format_execution_plan(&plan))
+            .map_err(|e| e.to_string());
+
+        Self { input, output }
+    }
+}
+
+impl Display for OptimizationTest {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "OptimizationTest:")?;
+        writeln!(f, "  input:")?;
+        for line in &self.input {
+            writeln!(f, "    - {line}")?;
+        }
+        writeln!(f, "  output:")?;
+        match &self.output {
+            Ok(output) => {
+                writeln!(f, "    Ok:")?;
+                for line in output {
+                    writeln!(f, "      - {line}")?;
+                }
+            }
+            Err(err) => {
+                writeln!(f, "    Err: {err}")?;
+            }
+        }
+        Ok(())
+    }
+}
+
+pub fn format_execution_plan(plan: &Arc<dyn ExecutionPlan>) -> Vec<String> {
+    format_lines(&displayable(plan.as_ref()).indent(false).to_string())
+}
+
+fn format_lines(s: &str) -> Vec<String> {
+    s.trim().split('\n').map(|s| s.to_string()).collect()
+}
+
+/// Create a simple ProjectionExec with column indices (simplified version)
+pub fn simple_projection_exec(
+    input: Arc<dyn ExecutionPlan>,
+    columns: Vec<usize>,
+) -> Arc<dyn ExecutionPlan> {
+    let schema = input.schema();
+    let exprs: Vec<(Arc<dyn PhysicalExpr>, String)> = columns
+        .iter()
+        .map(|&i| {
+            let field = schema.field(i);
+            (
+                Arc::new(expressions::Column::new(field.name(), i))
+                    as Arc<dyn PhysicalExpr>,
+                field.name().to_string(),
+            )
+        })
+        .collect();
+
+    projection_exec(exprs, input).unwrap()
+}
+
+/// Create a ProjectionExec with column aliases
+pub fn projection_exec_with_alias(
+    input: Arc<dyn ExecutionPlan>,
+    columns: Vec<(usize, &str)>,
+) -> Arc<dyn ExecutionPlan> {
+    let schema = input.schema();
+    let exprs: Vec<(Arc<dyn PhysicalExpr>, String)> = columns
+        .iter()
+        .map(|&(i, alias)| {
+            (
+                Arc::new(expressions::Column::new(schema.field(i).name(), i))
+                    as Arc<dyn PhysicalExpr>,
+                alias.to_string(),
+            )
+        })
+        .collect();
+
+    projection_exec(exprs, input).unwrap()
+}
+
+/// Create a sort expression with custom name and index
+pub fn sort_expr_named(name: &str, index: usize) -> PhysicalSortExpr {
+    PhysicalSortExpr {
+        expr: Arc::new(expressions::Column::new(name, index)),
+        options: SortOptions::default(),
+    }
+}
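+// Typical use of the harness above together with `TestScan` below (a sketch
+// drawn from the sort-pushdown tests earlier in this diff; `PushdownSort`,
+// `sort_expr`, and `sort_exec` come from those tests, and the snapshot body
+// is elided):
+//
+//     let a = sort_expr("a", &schema);
+//     let ordering = LexOrdering::new(vec![a]).unwrap();
+//     let scan = Arc::new(TestScan::new(schema.clone(), vec![]).with_exact_pushdown(true));
+//     let plan = sort_exec(ordering, scan);
+//     insta::assert_snapshot!(OptimizationTest::new(plan, PushdownSort::new(), true), @"...");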
+/// +/// ## Configuration +/// +/// - `exact_pushdown`: if `true`, `try_pushdown_sort` returns `Exact` +/// (source guarantees ordering, SortExec can be removed); if `false` +/// (default), returns `Inexact` (SortExec kept). +/// - `supports_fetch`: if `true`, `with_fetch()` returns `Some` so the +/// optimizer can push a LIMIT into the source; if `false` (default), +/// `with_fetch()` returns `None`, forcing a `GlobalLimitExec` wrapper. +#[derive(Debug, Clone)] +pub struct TestScan { + schema: SchemaRef, + output_ordering: Vec, + plan_properties: Arc, + // Store the requested ordering for display + requested_ordering: Option, + /// If true, `try_pushdown_sort` returns `Exact` instead of `Inexact`. + exact_pushdown: bool, + /// If true, `with_fetch()` returns `Some(...)` (source absorbs the limit). + supports_fetch: bool, + /// The fetch (LIMIT) value pushed into this scan via `with_fetch()`. + fetch: Option, +} + +impl TestScan { + /// Create a new TestScan with the given schema and output ordering + pub fn new(schema: SchemaRef, output_ordering: Vec) -> Self { + let eq_properties = if !output_ordering.is_empty() { + // Convert Vec to the format expected by new_with_orderings + // We need to extract the inner Vec from each LexOrdering + let orderings: Vec> = output_ordering + .iter() + .map(|lex_ordering| { + // LexOrdering implements IntoIterator, so we can collect it + lex_ordering.iter().cloned().collect() + }) + .collect(); + + EquivalenceProperties::new_with_orderings(Arc::clone(&schema), orderings) + } else { + EquivalenceProperties::new(Arc::clone(&schema)) + }; + + let plan_properties = PlanProperties::new( + eq_properties, + Partitioning::UnknownPartitioning(1), + EmissionType::Incremental, + Boundedness::Bounded, + ); + + Self { + schema, + output_ordering, + plan_properties: Arc::new(plan_properties), + requested_ordering: None, + exact_pushdown: false, + supports_fetch: false, + fetch: None, + } + } + + /// Create a TestScan with a single output ordering + pub fn with_ordering(schema: SchemaRef, ordering: LexOrdering) -> Self { + Self::new(schema, vec![ordering]) + } + + /// Set whether `try_pushdown_sort` returns `Exact` (true) or `Inexact` (false). + pub fn with_exact_pushdown(mut self, exact: bool) -> Self { + self.exact_pushdown = exact; + self + } + + /// Set whether `with_fetch()` returns `Some` (true) or `None` (false). 
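+    ///
+    /// For example (hypothetical usage), a limit-pushdown test might build
+    /// `TestScan::new(schema, vec![]).with_supports_fetch(true)` and then
+    /// assert that the optimizer pushes the LIMIT into the scan instead of
+    /// keeping a `GlobalLimitExec` wrapper.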
+ pub fn with_supports_fetch(mut self, supports: bool) -> Self { + self.supports_fetch = supports; + self + } +} + +impl DisplayAs for TestScan { + fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "TestScan")?; + let mut sep = ": "; + if !self.output_ordering.is_empty() { + write!(f, "{sep}output_ordering=[")?; + for (i, sort_expr) in self.output_ordering[0].iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{sort_expr}")?; + } + write!(f, "]")?; + sep = ", "; + } + if let Some(ref req) = self.requested_ordering { + write!(f, "{sep}requested_ordering=[")?; + for (i, sort_expr) in req.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{sort_expr}")?; + } + write!(f, "]")?; + sep = ", "; + } + if let Some(fetch) = self.fetch { + write!(f, "{sep}fetch={fetch}")?; + } + Ok(()) + } + DisplayFormatType::TreeRender => { + write!(f, "TestScan") + } + } + } +} + +impl ExecutionPlan for TestScan { + fn name(&self) -> &str { + "TestScan" + } + + fn properties(&self) -> &Arc { + &self.plan_properties + } + + fn children(&self) -> Vec<&Arc> { + vec![] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + if children.is_empty() { + Ok(self) + } else { + internal_err!("TestScan should have no children") + } + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> Result { + internal_err!("TestScan is for testing optimizer only, not for execution") + } + + fn partition_statistics(&self, _partition: Option) -> Result> { + Ok(Arc::new(Statistics::new_unknown(&self.schema))) + } + + fn with_fetch(&self, fetch: Option) -> Option> { + if self.supports_fetch { + let mut new_scan = self.clone(); + new_scan.fetch = fetch; + Some(Arc::new(new_scan)) + } else { + None + } + } + + fn fetch(&self) -> Option { + self.fetch + } + + // This is the key method - implement sort pushdown + fn try_pushdown_sort( + &self, + order: &[PhysicalSortExpr], + ) -> Result>> { + // For testing purposes, accept ANY ordering request + // and create a new TestScan that shows what was requested + let requested_ordering = LexOrdering::new(order.to_vec()); + + let mut new_scan = self.clone(); + new_scan.requested_ordering = requested_ordering; + + if self.exact_pushdown { + // Update plan properties to reflect the guaranteed ordering + let orderings: Vec> = vec![order.to_vec()]; + let eq_properties = EquivalenceProperties::new_with_orderings( + Arc::clone(&self.schema), + orderings, + ); + new_scan.plan_properties = Arc::new(PlanProperties::new( + eq_properties, + Partitioning::UnknownPartitioning(1), + EmissionType::Incremental, + Boundedness::Bounded, + )); + Ok(SortOrderPushdownResult::Exact { + inner: Arc::new(new_scan), + }) + } else { + Ok(SortOrderPushdownResult::Inexact { + inner: Arc::new(new_scan), + }) + } + } + + fn apply_expressions( + &self, + f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result, + ) -> Result { + // Visit expressions in output_ordering + let mut tnr = TreeNodeRecursion::Continue; + for ordering in &self.output_ordering { + for sort_expr in ordering { + tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?; + } + } + + // Visit expressions in requested_ordering if present + if let Some(ordering) = &self.requested_ordering { + for sort_expr in ordering { + tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?; + } + } + + Ok(tnr) + } +} + +/// Helper function to create a TestScan with ordering +pub fn 
test_scan_with_ordering(
+    schema: SchemaRef,
+    ordering: LexOrdering,
+) -> Arc<dyn ExecutionPlan> {
+    Arc::new(TestScan::with_ordering(schema, ordering))
+}
diff --git a/datafusion/core/tests/physical_optimizer/window_optimize.rs b/datafusion/core/tests/physical_optimizer/window_optimize.rs
index fc1e6444d756e..796f6b6259716 100644
--- a/datafusion/core/tests/physical_optimizer/window_optimize.rs
+++ b/datafusion/core/tests/physical_optimizer/window_optimize.rs
@@ -26,10 +26,10 @@ mod test {
     use datafusion_expr::WindowFrame;
     use datafusion_functions_aggregate::count::count_udaf;
     use datafusion_physical_expr::aggregate::AggregateExprBuilder;
-    use datafusion_physical_expr::expressions::{col, Column};
+    use datafusion_physical_expr::expressions::{Column, col};
     use datafusion_physical_expr::window::PlainAggregateWindowExpr;
     use datafusion_physical_plan::windows::BoundedWindowAggExec;
-    use datafusion_physical_plan::{common, ExecutionPlan, InputOrderMode};
+    use datafusion_physical_plan::{ExecutionPlan, InputOrderMode, common};
     use std::sync::Arc;

     /// Test case for
diff --git a/datafusion/core/tests/physical_optimizer/window_topn.rs b/datafusion/core/tests/physical_optimizer/window_topn.rs
new file mode 100644
index 0000000000000..e3f73a85353cc
--- /dev/null
+++ b/datafusion/core/tests/physical_optimizer/window_topn.rs
@@ -0,0 +1,425 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Tests for the WindowTopN physical optimizer rule.
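+//!
+//! Plan shapes below are taken from the snapshot tests in this module: the
+//! rule looks for a `FilterExec` on a `ROW_NUMBER()` window output, e.g.
+//!
+//! ```text
+//! FilterExec: row_number@2 <= k
+//!   BoundedWindowAggExec: ROW_NUMBER() PARTITION BY pk ORDER BY val
+//!     SortExec: expr=[pk ASC, val ASC]
+//! ```
+//!
+//! and rewrites the sort + filter into a per-partition top-K:
+//!
+//! ```text
+//! BoundedWindowAggExec: ROW_NUMBER() PARTITION BY pk ORDER BY val
+//!   PartitionedTopKExec: fetch=k, partition=[pk@0], order=[val@1 ASC]
+//! ```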
+
+use std::sync::Arc;
+
+use arrow::datatypes::{DataType, Field, Schema};
+use datafusion_common::Result;
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::Operator;
+use datafusion_expr::{WindowFrame, WindowFrameBound, WindowFrameUnits};
+use datafusion_functions_window::row_number::row_number_udwf;
+use datafusion_physical_expr::PhysicalExpr;
+use datafusion_physical_expr::expressions::{BinaryExpr, Column, col, lit};
+use datafusion_physical_expr::window::StandardWindowExpr;
+use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
+use datafusion_physical_optimizer::PhysicalOptimizerRule;
+use datafusion_physical_optimizer::window_topn::WindowTopN;
+use datafusion_physical_plan::displayable;
+use datafusion_physical_plan::filter::FilterExec;
+use datafusion_physical_plan::placeholder_row::PlaceholderRowExec;
+use datafusion_physical_plan::projection::ProjectionExec;
+use datafusion_physical_plan::sorts::sort::SortExec;
+use datafusion_physical_plan::windows::{BoundedWindowAggExec, create_udwf_window_expr};
+use datafusion_physical_plan::{ExecutionPlan, InputOrderMode};
+use insta::assert_snapshot;
+
+fn schema() -> Arc<Schema> {
+    Arc::new(Schema::new(vec![
+        Field::new("pk", DataType::Int64, false),
+        Field::new("val", DataType::Int64, false),
+    ]))
+}
+
+fn plan_str(plan: &dyn ExecutionPlan) -> String {
+    displayable(plan).indent(true).to_string()
+}
+
+fn optimize(plan: Arc<dyn ExecutionPlan>) -> Result<Arc<dyn ExecutionPlan>> {
+    let mut config = ConfigOptions::new();
+    config.optimizer.enable_window_topn = true;
+    WindowTopN::new().optimize(plan, &config)
+}
+
+fn optimize_disabled(plan: Arc<dyn ExecutionPlan>) -> Result<Arc<dyn ExecutionPlan>> {
+    let mut config = ConfigOptions::new();
+    config.optimizer.enable_window_topn = false;
+    WindowTopN::new().optimize(plan, &config)
+}
+
+/// Build: FilterExec(rn <= limit) → BoundedWindowAggExec(ROW_NUMBER() PARTITION BY pk ORDER BY val) → SortExec(pk, val)
+fn build_window_topn_plan(
+    limit_value: i64,
+    op: Operator,
+) -> Result<Arc<dyn ExecutionPlan>> {
+    let s = schema();
+    let input: Arc<dyn ExecutionPlan> =
+        Arc::new(PlaceholderRowExec::new(Arc::clone(&s)));
+
+    // Sort by pk ASC, val ASC
+    let ordering = LexOrdering::new(vec![
+        PhysicalSortExpr::new_default(col("pk", &s)?).asc(),
+        PhysicalSortExpr::new_default(col("val", &s)?).asc(),
+    ])
+    .unwrap();
+
+    let sort: Arc<dyn ExecutionPlan> =
+        Arc::new(SortExec::new(ordering.clone(), input).with_preserve_partitioning(true));
+
+    // ROW_NUMBER() OVER (PARTITION BY pk ORDER BY val)
+    let partition_by = vec![col("pk", &s)?];
+    let order_by = vec![PhysicalSortExpr::new_default(col("val", &s)?).asc()];
+
+    let window_expr = Arc::new(StandardWindowExpr::new(
+        create_udwf_window_expr(
+            &row_number_udwf(),
+            &[],
+            &s,
+            "row_number".to_string(),
+            false,
+        )?,
+        &partition_by,
+        &order_by,
+        Arc::new(WindowFrame::new_bounds(
+            WindowFrameUnits::Rows,
+            WindowFrameBound::Preceding(ScalarValue::UInt64(None)),
+            WindowFrameBound::CurrentRow,
+        )),
+    ));
+
+    let window: Arc<dyn ExecutionPlan> = Arc::new(BoundedWindowAggExec::try_new(
+        vec![window_expr],
+        sort,
+        InputOrderMode::Sorted,
+        true,
+    )?);
+
+    // FilterExec: rn op limit_value
+    // The ROW_NUMBER column is at index 2 (after pk=0, val=1)
+    let rn_col = Arc::new(Column::new("row_number", 2));
+    let limit_lit = lit(ScalarValue::UInt64(Some(limit_value as u64)));
+    let predicate = Arc::new(BinaryExpr::new(rn_col, op, limit_lit));
+    let filter: Arc<dyn ExecutionPlan> =
+        Arc::new(FilterExec::try_new(predicate, window)?);
+
+    Ok(filter)
+}
+
+/// Build a plan with no partition-by: ROW_NUMBER() OVER (ORDER BY val)
+fn build_window_topn_no_partition(limit_value: i64) -> Result<Arc<dyn ExecutionPlan>> {
+    let s = schema();
+    let input: Arc<dyn ExecutionPlan> =
+        Arc::new(PlaceholderRowExec::new(Arc::clone(&s)));
+
+    // Sort by val ASC only (no partition key)
+    let ordering =
+        LexOrdering::new(vec![PhysicalSortExpr::new_default(col("val", &s)?).asc()])
+            .unwrap();
+
+    let sort: Arc<dyn ExecutionPlan> =
+        Arc::new(SortExec::new(ordering.clone(), input).with_preserve_partitioning(true));
+
+    // ROW_NUMBER() OVER (ORDER BY val) — no partition by
+    let order_by = vec![PhysicalSortExpr::new_default(col("val", &s)?).asc()];
+
+    let window_expr = Arc::new(StandardWindowExpr::new(
+        create_udwf_window_expr(
+            &row_number_udwf(),
+            &[],
+            &s,
+            "row_number".to_string(),
+            false,
+        )?,
+        &[], // empty partition_by
+        &order_by,
+        Arc::new(WindowFrame::new_bounds(
+            WindowFrameUnits::Rows,
+            WindowFrameBound::Preceding(ScalarValue::UInt64(None)),
+            WindowFrameBound::CurrentRow,
+        )),
+    ));
+
+    let window: Arc<dyn ExecutionPlan> = Arc::new(BoundedWindowAggExec::try_new(
+        vec![window_expr],
+        sort,
+        InputOrderMode::Sorted,
+        true,
+    )?);
+
+    let rn_col = Arc::new(Column::new("row_number", 2));
+    let limit_lit = lit(ScalarValue::UInt64(Some(limit_value as u64)));
+    let predicate = Arc::new(BinaryExpr::new(rn_col, Operator::LtEq, limit_lit));
+    let filter: Arc<dyn ExecutionPlan> =
+        Arc::new(FilterExec::try_new(predicate, window)?);
+
+    Ok(filter)
+}
+
+/// Build a plan where filter is on a data column (not window output)
+fn build_non_window_filter_plan() -> Result<Arc<dyn ExecutionPlan>> {
+    let s = schema();
+    let input: Arc<dyn ExecutionPlan> =
+        Arc::new(PlaceholderRowExec::new(Arc::clone(&s)));
+
+    let ordering = LexOrdering::new(vec![
+        PhysicalSortExpr::new_default(col("pk", &s)?).asc(),
+        PhysicalSortExpr::new_default(col("val", &s)?).asc(),
+    ])
+    .unwrap();
+
+    let sort: Arc<dyn ExecutionPlan> =
+        Arc::new(SortExec::new(ordering.clone(), input).with_preserve_partitioning(true));
+
+    let partition_by = vec![col("pk", &s)?];
+    let order_by = vec![PhysicalSortExpr::new_default(col("val", &s)?).asc()];
+
+    let window_expr = Arc::new(StandardWindowExpr::new(
+        create_udwf_window_expr(
+            &row_number_udwf(),
+            &[],
+            &s,
+            "row_number".to_string(),
+            false,
+        )?,
+        &partition_by,
+        &order_by,
+        Arc::new(WindowFrame::new_bounds(
+            WindowFrameUnits::Rows,
+            WindowFrameBound::Preceding(ScalarValue::UInt64(None)),
+            WindowFrameBound::CurrentRow,
+        )),
+    ));
+
+    let window: Arc<dyn ExecutionPlan> = Arc::new(BoundedWindowAggExec::try_new(
+        vec![window_expr],
+        sort,
+        InputOrderMode::Sorted,
+        true,
+    )?);
+
+    // Filter on data column val (index 1), NOT on window output
+    let val_col = Arc::new(Column::new("val", 1));
+    let limit_lit = lit(ScalarValue::Int64(Some(3)));
+    let predicate = Arc::new(BinaryExpr::new(val_col, Operator::LtEq, limit_lit));
+    let filter: Arc<dyn ExecutionPlan> =
+        Arc::new(FilterExec::try_new(predicate, window)?);
+
+    Ok(filter)
+}
+
+#[test]
+fn basic_row_number_rn_lteq_3() -> Result<()> {
+    let plan = build_window_topn_plan(3, Operator::LtEq)?;
+    let optimized = optimize(plan)?;
+    assert_snapshot!(plan_str(optimized.as_ref()), @r#"
+    BoundedWindowAggExec: wdw=[row_number: Field { "row_number": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      PartitionedTopKExec: fetch=3, partition=[pk@0], order=[val@1 ASC]
+        PlaceholderRowExec
+    "#);
+    Ok(())
+}
+
+#[test]
+fn rn_lt_3_becomes_fetch_2() -> Result<()> {
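+    // `rn < 3` keeps rows with row_number 1 and 2, so the rewrite is expected
+    // to emit fetch = 2 (limit - 1 for a strict inequality).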
+    let plan = build_window_topn_plan(3, Operator::Lt)?;
+    let optimized = optimize(plan)?;
+    assert_snapshot!(plan_str(optimized.as_ref()), @r#"
+    BoundedWindowAggExec: wdw=[row_number: Field { "row_number": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      PartitionedTopKExec: fetch=2, partition=[pk@0], order=[val@1 ASC]
+        PlaceholderRowExec
+    "#);
+    Ok(())
+}
+
+#[test]
+fn flipped_3_gteq_rn() -> Result<()> {
+    let plan = {
+        let s = schema();
+        let input: Arc<dyn ExecutionPlan> =
+            Arc::new(PlaceholderRowExec::new(Arc::clone(&s)));
+
+        let ordering = LexOrdering::new(vec![
+            PhysicalSortExpr::new_default(col("pk", &s)?).asc(),
+            PhysicalSortExpr::new_default(col("val", &s)?).asc(),
+        ])
+        .unwrap();
+
+        let sort: Arc<dyn ExecutionPlan> = Arc::new(
+            SortExec::new(ordering.clone(), input).with_preserve_partitioning(true),
+        );
+
+        let partition_by = vec![col("pk", &s)?];
+        let order_by = vec![PhysicalSortExpr::new_default(col("val", &s)?).asc()];
+
+        let window_expr = Arc::new(StandardWindowExpr::new(
+            create_udwf_window_expr(
+                &row_number_udwf(),
+                &[],
+                &s,
+                "row_number".to_string(),
+                false,
+            )?,
+            &partition_by,
+            &order_by,
+            Arc::new(WindowFrame::new_bounds(
+                WindowFrameUnits::Rows,
+                WindowFrameBound::Preceding(ScalarValue::UInt64(None)),
+                WindowFrameBound::CurrentRow,
+            )),
+        ));
+
+        let window: Arc<dyn ExecutionPlan> = Arc::new(BoundedWindowAggExec::try_new(
+            vec![window_expr],
+            sort,
+            InputOrderMode::Sorted,
+            true,
+        )?);
+
+        // Flipped: 3 >= rn (Literal GtEq Column)
+        let rn_col = Arc::new(Column::new("row_number", 2));
+        let limit_lit = lit(ScalarValue::UInt64(Some(3)));
+        let predicate = Arc::new(BinaryExpr::new(limit_lit, Operator::GtEq, rn_col));
+        let filter: Arc<dyn ExecutionPlan> =
+            Arc::new(FilterExec::try_new(predicate, window)?);
+        filter
+    };
+
+    let optimized = optimize(plan)?;
+    assert_snapshot!(plan_str(optimized.as_ref()), @r#"
+    BoundedWindowAggExec: wdw=[row_number: Field { "row_number": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      PartitionedTopKExec: fetch=3, partition=[pk@0], order=[val@1 ASC]
+        PlaceholderRowExec
+    "#);
+    Ok(())
+}
+
+#[test]
+fn non_window_column_filter_no_change() -> Result<()> {
+    let plan = build_non_window_filter_plan()?;
+    let before = plan_str(plan.as_ref());
+    let optimized = optimize(plan)?;
+    let after = plan_str(optimized.as_ref());
+    assert_eq!(
+        before, after,
+        "Plan should not change when filter is on data column"
+    );
+    Ok(())
+}
+
+#[test]
+fn config_disabled_no_change() -> Result<()> {
+    let plan = build_window_topn_plan(3, Operator::LtEq)?;
+    let before = plan_str(plan.as_ref());
+    let optimized = optimize_disabled(plan)?;
+    let after = plan_str(optimized.as_ref());
+    assert_eq!(
+        before, after,
+        "Plan should not change when config is disabled"
+    );
+    Ok(())
+}
+
+#[test]
+fn no_partition_by_no_change() -> Result<()> {
+    // Without PARTITION BY, this is a global top-K which SortExec with
+    // fetch already handles — the rule should not fire.
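+    // (A plain `SortExec: TopK(fetch=N)` covers the global case, so a
+    // PartitionedTopKExec rewrite would add nothing here.)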
+    let plan = build_window_topn_no_partition(5)?;
+    let optimized = optimize(plan)?;
+    assert_snapshot!(plan_str(optimized.as_ref()), @r#"
+    FilterExec: row_number@2 <= 5
+      BoundedWindowAggExec: wdw=[row_number: Field { "row_number": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        SortExec: expr=[val@1 ASC], preserve_partitioning=[true]
+          PlaceholderRowExec
+    "#);
+    Ok(())
+}
+
+#[test]
+fn with_projection_between() -> Result<()> {
+    let s = schema();
+    let input: Arc<dyn ExecutionPlan> =
+        Arc::new(PlaceholderRowExec::new(Arc::clone(&s)));
+
+    let ordering = LexOrdering::new(vec![
+        PhysicalSortExpr::new_default(col("pk", &s)?).asc(),
+        PhysicalSortExpr::new_default(col("val", &s)?).asc(),
+    ])
+    .unwrap();
+
+    let sort: Arc<dyn ExecutionPlan> =
+        Arc::new(SortExec::new(ordering.clone(), input).with_preserve_partitioning(true));
+
+    let partition_by = vec![col("pk", &s)?];
+    let order_by = vec![PhysicalSortExpr::new_default(col("val", &s)?).asc()];
+
+    let window_expr = Arc::new(StandardWindowExpr::new(
+        create_udwf_window_expr(
+            &row_number_udwf(),
+            &[],
+            &s,
+            "row_number".to_string(),
+            false,
+        )?,
+        &partition_by,
+        &order_by,
+        Arc::new(WindowFrame::new_bounds(
+            WindowFrameUnits::Rows,
+            WindowFrameBound::Preceding(ScalarValue::UInt64(None)),
+            WindowFrameBound::CurrentRow,
+        )),
+    ));
+
+    let window: Arc<dyn ExecutionPlan> = Arc::new(BoundedWindowAggExec::try_new(
+        vec![window_expr],
+        sort,
+        InputOrderMode::Sorted,
+        true,
+    )?);
+
+    // Add a ProjectionExec between Filter and Window
+    let window_schema = window.schema();
+    let proj_exprs: Vec<(Arc<dyn PhysicalExpr>, String)> = window_schema
+        .fields()
+        .iter()
+        .enumerate()
+        .map(|(i, f)| {
+            (
+                Arc::new(Column::new(f.name(), i)) as Arc<dyn PhysicalExpr>,
+                f.name().to_string(),
+            )
+        })
+        .collect();
+
+    let projection: Arc<dyn ExecutionPlan> =
+        Arc::new(ProjectionExec::try_new(proj_exprs, window)?);
+
+    // rn column is still at index 2 in the projected schema
+    let rn_col = Arc::new(Column::new("row_number", 2));
+    let limit_lit = lit(ScalarValue::UInt64(Some(3)));
+    let predicate = Arc::new(BinaryExpr::new(rn_col, Operator::LtEq, limit_lit));
+    let filter: Arc<dyn ExecutionPlan> =
+        Arc::new(FilterExec::try_new(predicate, projection)?);
+
+    let optimized = optimize(filter)?;
+    assert_snapshot!(plan_str(optimized.as_ref()), @r#"
+    ProjectionExec: expr=[pk@0 as pk, val@1 as val, row_number@2 as row_number]
+      BoundedWindowAggExec: wdw=[row_number: Field { "row_number": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        PartitionedTopKExec: fetch=3, partition=[pk@0], order=[val@1 ASC]
+          PlaceholderRowExec
+    "#);
+    Ok(())
+}
diff --git a/datafusion/core/tests/schema_adapter/schema_adapter_integration_tests.rs b/datafusion/core/tests/schema_adapter/schema_adapter_integration_tests.rs
deleted file mode 100644
index c3c92a9028d67..0000000000000
--- a/datafusion/core/tests/schema_adapter/schema_adapter_integration_tests.rs
+++ /dev/null
@@ -1,363 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::sync::Arc; - -use arrow::array::RecordBatch; -use arrow_schema::{DataType, Field, Schema, SchemaRef}; -use bytes::{BufMut, BytesMut}; -use datafusion::common::Result; -use datafusion::datasource::listing::PartitionedFile; -use datafusion::datasource::physical_plan::{ - ArrowSource, CsvSource, FileSource, JsonSource, ParquetSource, -}; -use datafusion::physical_plan::ExecutionPlan; -use datafusion::prelude::SessionContext; -use datafusion_common::ColumnStatistics; -use datafusion_datasource::file_scan_config::FileScanConfigBuilder; -use datafusion_datasource::schema_adapter::{ - SchemaAdapter, SchemaAdapterFactory, SchemaMapper, -}; -use datafusion_datasource::source::DataSourceExec; -use datafusion_execution::object_store::ObjectStoreUrl; -use object_store::{memory::InMemory, path::Path, ObjectStore}; -use parquet::arrow::ArrowWriter; - -async fn write_parquet(batch: RecordBatch, store: Arc, path: &str) { - let mut out = BytesMut::new().writer(); - { - let mut writer = ArrowWriter::try_new(&mut out, batch.schema(), None).unwrap(); - writer.write(&batch).unwrap(); - writer.finish().unwrap(); - } - let data = out.into_inner().freeze(); - store.put(&Path::from(path), data.into()).await.unwrap(); -} - -/// A schema adapter factory that transforms column names to uppercase -#[derive(Debug, PartialEq)] -struct UppercaseAdapterFactory {} - -impl SchemaAdapterFactory for UppercaseAdapterFactory { - fn create( - &self, - projected_table_schema: SchemaRef, - _table_schema: SchemaRef, - ) -> Box { - Box::new(UppercaseAdapter { - table_schema: projected_table_schema, - }) - } -} - -/// Schema adapter that transforms column names to uppercase -#[derive(Debug)] -struct UppercaseAdapter { - table_schema: SchemaRef, -} - -impl SchemaAdapter for UppercaseAdapter { - fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option { - let field = self.table_schema.field(index); - let uppercase_name = field.name().to_uppercase(); - file_schema - .fields() - .iter() - .position(|f| f.name().to_uppercase() == uppercase_name) - } - - fn map_schema( - &self, - file_schema: &Schema, - ) -> Result<(Arc, Vec)> { - let mut projection = Vec::new(); - - // Map each field in the table schema to the corresponding field in the file schema - for table_field in self.table_schema.fields() { - let uppercase_name = table_field.name().to_uppercase(); - if let Some(pos) = file_schema - .fields() - .iter() - .position(|f| f.name().to_uppercase() == uppercase_name) - { - projection.push(pos); - } - } - - let mapper = UppercaseSchemaMapper { - output_schema: self.output_schema(), - projection: projection.clone(), - }; - - Ok((Arc::new(mapper), projection)) - } -} - -impl UppercaseAdapter { - fn output_schema(&self) -> SchemaRef { - let fields: Vec = self - .table_schema - .fields() - .iter() - .map(|f| { - Field::new( - f.name().to_uppercase().as_str(), - f.data_type().clone(), - f.is_nullable(), - ) - }) - .collect(); - - Arc::new(Schema::new(fields)) - } -} - -#[derive(Debug)] -struct UppercaseSchemaMapper { - output_schema: SchemaRef, - projection: Vec, -} - -impl SchemaMapper for 
UppercaseSchemaMapper { - fn map_batch(&self, batch: RecordBatch) -> Result { - let columns = self - .projection - .iter() - .map(|&i| batch.column(i).clone()) - .collect::>(); - Ok(RecordBatch::try_new(self.output_schema.clone(), columns)?) - } - - fn map_column_statistics( - &self, - stats: &[ColumnStatistics], - ) -> Result> { - Ok(self - .projection - .iter() - .map(|&i| stats.get(i).cloned().unwrap_or_default()) - .collect()) - } -} - -#[cfg(feature = "parquet")] -#[tokio::test] -async fn test_parquet_integration_with_schema_adapter() -> Result<()> { - // Create test data - let batch = RecordBatch::try_new( - Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, true), - ])), - vec![ - Arc::new(arrow::array::Int32Array::from(vec![1, 2, 3])), - Arc::new(arrow::array::StringArray::from(vec!["a", "b", "c"])), - ], - )?; - - let store = Arc::new(InMemory::new()) as Arc; - let store_url = ObjectStoreUrl::parse("memory://").unwrap(); - let path = "test.parquet"; - write_parquet(batch.clone(), store.clone(), path).await; - - // Get the actual file size from the object store - let object_meta = store.head(&Path::from(path)).await?; - let file_size = object_meta.size; - - // Create a session context and register the object store - let ctx = SessionContext::new(); - ctx.register_object_store(store_url.as_ref(), Arc::clone(&store)); - - // Create a ParquetSource with the adapter factory - let file_source = ParquetSource::default() - .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?; - - // Create a table schema with uppercase column names - let table_schema = Arc::new(Schema::new(vec![ - Field::new("ID", DataType::Int32, false), - Field::new("NAME", DataType::Utf8, true), - ])); - - let config = FileScanConfigBuilder::new(store_url, table_schema.clone(), file_source) - .with_file(PartitionedFile::new(path, file_size)) - .build(); - - // Create a data source executor - let exec = DataSourceExec::from_data_source(config); - - // Collect results - let task_ctx = ctx.task_ctx(); - let stream = exec.execute(0, task_ctx)?; - let batches = datafusion::physical_plan::common::collect(stream).await?; - - // There should be one batch - assert_eq!(batches.len(), 1); - - // Verify the schema has the uppercase column names - let result_schema = batches[0].schema(); - assert_eq!(result_schema.field(0).name(), "ID"); - assert_eq!(result_schema.field(1).name(), "NAME"); - - Ok(()) -} - -#[cfg(feature = "parquet")] -#[tokio::test] -async fn test_parquet_integration_with_schema_adapter_and_expression_rewriter( -) -> Result<()> { - // Create test data - let batch = RecordBatch::try_new( - Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, true), - ])), - vec![ - Arc::new(arrow::array::Int32Array::from(vec![1, 2, 3])), - Arc::new(arrow::array::StringArray::from(vec!["a", "b", "c"])), - ], - )?; - - let store = Arc::new(InMemory::new()) as Arc; - let store_url = ObjectStoreUrl::parse("memory://").unwrap(); - let path = "test.parquet"; - write_parquet(batch.clone(), store.clone(), path).await; - - // Get the actual file size from the object store - let object_meta = store.head(&Path::from(path)).await?; - let file_size = object_meta.size; - - // Create a session context and register the object store - let ctx = SessionContext::new(); - ctx.register_object_store(store_url.as_ref(), Arc::clone(&store)); - - // Create a ParquetSource with the adapter factory - let file_source = 
ParquetSource::default() - .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?; - - let config = FileScanConfigBuilder::new(store_url, batch.schema(), file_source) - .with_file(PartitionedFile::new(path, file_size)) - .build(); - - // Create a data source executor - let exec = DataSourceExec::from_data_source(config); - - // Collect results - let task_ctx = ctx.task_ctx(); - let stream = exec.execute(0, task_ctx)?; - let batches = datafusion::physical_plan::common::collect(stream).await?; - - // There should be one batch - assert_eq!(batches.len(), 1); - - // Verify the schema has the original column names (schema adapter not applied in DataSourceExec) - let result_schema = batches[0].schema(); - assert_eq!(result_schema.field(0).name(), "id"); - assert_eq!(result_schema.field(1).name(), "name"); - - Ok(()) -} - -#[tokio::test] -async fn test_multi_source_schema_adapter_reuse() -> Result<()> { - // This test verifies that the same schema adapter factory can be reused - // across different file source types. This is important for ensuring that: - // 1. The schema adapter factory interface works uniformly across all source types - // 2. The factory can be shared and cloned efficiently using Arc - // 3. Various data source implementations correctly implement the schema adapter factory pattern - - // Create a test factory - let factory = Arc::new(UppercaseAdapterFactory {}); - - // Test ArrowSource - { - let source = ArrowSource::default(); - let source_with_adapter = source - .clone() - .with_schema_adapter_factory(factory.clone()) - .unwrap(); - - let base_source: Arc = source.into(); - assert!(base_source.schema_adapter_factory().is_none()); - assert!(source_with_adapter.schema_adapter_factory().is_some()); - - let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap(); - assert_eq!( - format!("{:?}", retrieved_factory.as_ref()), - format!("{:?}", factory.as_ref()) - ); - } - - // Test ParquetSource - #[cfg(feature = "parquet")] - { - let source = ParquetSource::default(); - let source_with_adapter = source - .clone() - .with_schema_adapter_factory(factory.clone()) - .unwrap(); - - let base_source: Arc = source.into(); - assert!(base_source.schema_adapter_factory().is_none()); - assert!(source_with_adapter.schema_adapter_factory().is_some()); - - let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap(); - assert_eq!( - format!("{:?}", retrieved_factory.as_ref()), - format!("{:?}", factory.as_ref()) - ); - } - - // Test CsvSource - { - let source = CsvSource::default(); - let source_with_adapter = source - .clone() - .with_schema_adapter_factory(factory.clone()) - .unwrap(); - - let base_source: Arc = source.into(); - assert!(base_source.schema_adapter_factory().is_none()); - assert!(source_with_adapter.schema_adapter_factory().is_some()); - - let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap(); - assert_eq!( - format!("{:?}", retrieved_factory.as_ref()), - format!("{:?}", factory.as_ref()) - ); - } - - // Test JsonSource - { - let source = JsonSource::default(); - let source_with_adapter = source - .clone() - .with_schema_adapter_factory(factory.clone()) - .unwrap(); - - let base_source: Arc = source.into(); - assert!(base_source.schema_adapter_factory().is_none()); - assert!(source_with_adapter.schema_adapter_factory().is_some()); - - let retrieved_factory = source_with_adapter.schema_adapter_factory().unwrap(); - assert_eq!( - format!("{:?}", retrieved_factory.as_ref()), - format!("{:?}", factory.as_ref()) - 
); - } - - Ok(()) -} diff --git a/datafusion/core/tests/set_comparison.rs b/datafusion/core/tests/set_comparison.rs new file mode 100644 index 0000000000000..464d6c937b328 --- /dev/null +++ b/datafusion/core/tests/set_comparison.rs @@ -0,0 +1,193 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use arrow::array::{Int32Array, StringArray}; +use arrow::datatypes::{DataType, Field, Schema}; +use arrow::record_batch::RecordBatch; +use datafusion::prelude::SessionContext; +use datafusion_common::{Result, assert_batches_eq, assert_contains}; + +fn build_table(values: &[i32]) -> Result { + let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int32, true)])); + let array = + Arc::new(Int32Array::from(values.to_vec())) as Arc; + RecordBatch::try_new(schema, vec![array]).map_err(Into::into) +} + +#[tokio::test] +async fn set_comparison_any() -> Result<()> { + let ctx = SessionContext::new(); + + ctx.register_batch("t", build_table(&[1, 6, 10])?)?; + // Include a NULL in the subquery input to ensure we propagate UNKNOWN correctly. + ctx.register_batch("s", { + let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int32, true)])); + let array = Arc::new(Int32Array::from(vec![Some(5), None])) + as Arc; + RecordBatch::try_new(schema, vec![array])? 
+ })?; + + let df = ctx + .sql("select v from t where v > any(select v from s)") + .await?; + let results = df.collect().await?; + + assert_batches_eq!( + &["+----+", "| v |", "+----+", "| 6 |", "| 10 |", "+----+",], + &results + ); + Ok(()) +} + +#[tokio::test] +async fn set_comparison_any_aggregate_subquery() -> Result<()> { + let ctx = SessionContext::new(); + + ctx.register_batch("t", build_table(&[1, 7])?)?; + ctx.register_batch("s", build_table(&[1, 2, 3])?)?; + + let df = ctx + .sql( + "select v from t where v > any(select sum(v) from s group by v % 2) order by v", + ) + .await?; + let results = df.collect().await?; + + assert_batches_eq!(&["+---+", "| v |", "+---+", "| 7 |", "+---+",], &results); + Ok(()) +} + +#[tokio::test] +async fn set_comparison_all_empty() -> Result<()> { + let ctx = SessionContext::new(); + + ctx.register_batch("t", build_table(&[1, 6, 10])?)?; + ctx.register_batch( + "e", + RecordBatch::new_empty(Arc::new(Schema::new(vec![Field::new( + "v", + DataType::Int32, + true, + )]))), + )?; + + let df = ctx + .sql("select v from t where v < all(select v from e)") + .await?; + let results = df.collect().await?; + + assert_batches_eq!( + &[ + "+----+", "| v |", "+----+", "| 1 |", "| 6 |", "| 10 |", "+----+", + ], + &results + ); + Ok(()) +} + +#[tokio::test] +async fn set_comparison_type_mismatch() -> Result<()> { + let ctx = SessionContext::new(); + + ctx.register_batch("t", build_table(&[1])?)?; + ctx.register_batch("strings", { + let schema = Arc::new(Schema::new(vec![Field::new("s", DataType::Utf8, true)])); + let array = Arc::new(StringArray::from(vec![Some("a"), Some("b")])) + as Arc; + RecordBatch::try_new(schema, vec![array])? + })?; + + let df = ctx + .sql("select v from t where v > any(select s from strings)") + .await?; + let err = df.collect().await.unwrap_err(); + assert_contains!( + err.to_string(), + "expr type Int32 can't cast to Utf8 in SetComparison" + ); + Ok(()) +} + +#[tokio::test] +async fn set_comparison_multiple_operators() -> Result<()> { + let ctx = SessionContext::new(); + + ctx.register_batch("t", build_table(&[1, 2, 3, 4])?)?; + ctx.register_batch("s", build_table(&[2, 3])?)?; + + let df = ctx + .sql("select v from t where v = any(select v from s) order by v") + .await?; + let results = df.collect().await?; + assert_batches_eq!( + &["+---+", "| v |", "+---+", "| 2 |", "| 3 |", "+---+",], + &results + ); + + let df = ctx + .sql("select v from t where v != all(select v from s) order by v") + .await?; + let results = df.collect().await?; + assert_batches_eq!( + &["+---+", "| v |", "+---+", "| 1 |", "| 4 |", "+---+",], + &results + ); + + let df = ctx + .sql("select v from t where v >= all(select v from s) order by v") + .await?; + let results = df.collect().await?; + assert_batches_eq!( + &["+---+", "| v |", "+---+", "| 3 |", "| 4 |", "+---+",], + &results + ); + + let df = ctx + .sql("select v from t where v <= any(select v from s) order by v") + .await?; + let results = df.collect().await?; + assert_batches_eq!( + &[ + "+---+", "| v |", "+---+", "| 1 |", "| 2 |", "| 3 |", "+---+", + ], + &results + ); + Ok(()) +} + +#[tokio::test] +async fn set_comparison_null_semantics_all() -> Result<()> { + let ctx = SessionContext::new(); + + ctx.register_batch("t", build_table(&[5])?)?; + ctx.register_batch("s", { + let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int32, true)])); + let array = Arc::new(Int32Array::from(vec![Some(1), None])) + as Arc; + RecordBatch::try_new(schema, vec![array])? 
+ })?; + + let df = ctx + .sql("select v from t where v != all(select v from s)") + .await?; + let results = df.collect().await?; + let row_count: usize = results.iter().map(|batch| batch.num_rows()).sum(); + assert_eq!(0, row_count); + Ok(()) +} diff --git a/datafusion/core/tests/sql/aggregates/basic.rs b/datafusion/core/tests/sql/aggregates/basic.rs index 4b421b5294e01..3e5dc6a0b1872 100644 --- a/datafusion/core/tests/sql/aggregates/basic.rs +++ b/datafusion/core/tests/sql/aggregates/basic.rs @@ -365,7 +365,7 @@ async fn count_distinct_dictionary_all_null_values() -> Result<()> { assert_snapshot!( batches_to_string(&results), - @r###" + @r" +-----+---------------+ | cnt | count(t.num2) | +-----+---------------+ @@ -375,7 +375,7 @@ async fn count_distinct_dictionary_all_null_values() -> Result<()> { | 0 | 1 | | 0 | 1 | +-----+---------------+ - "### + " ); // Test with multiple partitions @@ -430,13 +430,68 @@ async fn count_distinct_dictionary_mixed_values() -> Result<()> { assert_snapshot!( batches_to_string(&results), - @r###" + @r" +------------------------+ | count(DISTINCT t.dict) | +------------------------+ | 2 | +------------------------+ - "### + " + ); + + Ok(()) +} + +#[tokio::test] +async fn group_by_ree_dict_column() -> Result<()> { + let ctx = SessionContext::new(); + + let run_ends = Int32Array::from(vec![2, 4, 5]); + let dict = DictionaryArray::new( + UInt32Array::from(vec![0, 1, 2]), + Arc::new(StringArray::from(vec!["alpha", "beta", "gamma"])), + ); + let ree_col = RunArray::::try_new(&run_ends, &dict).unwrap(); + let value_col = Int32Array::from(vec![1, 2, 3, 4, 5]); + + let dict_type = + DataType::Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)); + let schema = Arc::new(Schema::new(vec![ + Field::new( + "group_col", + DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int32, false)), + Arc::new(Field::new("values", dict_type, true)), + ), + true, + ), + Field::new("value", DataType::Int32, false), + ])); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(ree_col), Arc::new(value_col)], + )?; + let table = MemTable::try_new(schema, vec![vec![batch]])?; + ctx.register_table("t", Arc::new(table))?; + + let results = ctx + .sql("SELECT group_col, SUM(value) as total FROM t GROUP BY group_col ORDER BY group_col") + .await? 
+ .collect() + .await?; + + assert_snapshot!( + batches_to_string(&results), + @r" + +-----------+-------+ + | group_col | total | + +-----------+-------+ + | alpha | 3 | + | beta | 7 | + | gamma | 5 | + +-----------+-------+ + " ); Ok(()) diff --git a/datafusion/core/tests/sql/aggregates/dict_nulls.rs b/datafusion/core/tests/sql/aggregates/dict_nulls.rs index da4b2c8d25c9d..f9e15a71a20f8 100644 --- a/datafusion/core/tests/sql/aggregates/dict_nulls.rs +++ b/datafusion/core/tests/sql/aggregates/dict_nulls.rs @@ -34,7 +34,7 @@ async fn test_aggregates_null_handling_comprehensive() -> Result<()> { assert_snapshot!( batches_to_string(&results_count), - @r###" + @r" +----------------+-----+ | dict_null_keys | cnt | +----------------+-----+ @@ -42,7 +42,7 @@ async fn test_aggregates_null_handling_comprehensive() -> Result<()> { | group_a | 2 | | group_b | 1 | +----------------+-----+ - "### + " ); // Test SUM null handling with extended data @@ -69,7 +69,7 @@ async fn test_aggregates_null_handling_comprehensive() -> Result<()> { assert_snapshot!( batches_to_string(&results_min), - @r###" + @r" +----------------+---------+ | dict_null_keys | minimum | +----------------+---------+ @@ -78,7 +78,7 @@ async fn test_aggregates_null_handling_comprehensive() -> Result<()> { | group_b | 1 | | group_c | 7 | +----------------+---------+ - "### + " ); // Test MEDIAN null handling with median data @@ -168,7 +168,7 @@ async fn test_first_last_value_order_by_null_handling() -> Result<()> { assert_snapshot!( batches_to_string(&results), - @r###" + @r" +------------+-------+--------------------+---------------------+-------------------+--------------------+ | dict_group | value | first_ignore_nulls | first_respect_nulls | last_ignore_nulls | last_respect_nulls | +------------+-------+--------------------+---------------------+-------------------+--------------------+ @@ -178,7 +178,7 @@ async fn test_first_last_value_order_by_null_handling() -> Result<()> { | group_a | | 5 | | 20 | | | group_b | | 5 | | 20 | | +------------+-------+--------------------+---------------------+-------------------+--------------------+ - "### + " ); Ok(()) @@ -249,7 +249,7 @@ async fn test_first_last_value_group_by_dict_nulls() -> Result<()> { assert_snapshot!( batches_to_string(&results), - @r###" + @r" +----------------+-----------+----------+-----+ | dict_null_keys | first_val | last_val | cnt | +----------------+-----------+----------+-----+ @@ -257,7 +257,7 @@ async fn test_first_last_value_group_by_dict_nulls() -> Result<()> { | group_a | 10 | 50 | 2 | | group_b | 30 | 30 | 1 | +----------------+-----------+----------+-----+ - "### + " ); // Test GROUP BY with null values in dictionary @@ -275,7 +275,7 @@ async fn test_first_last_value_group_by_dict_nulls() -> Result<()> { assert_snapshot!( batches_to_string(&results2), - @r###" + @r" +----------------+-----------+----------+-----+ | dict_null_vals | first_val | last_val | cnt | +----------------+-----------+----------+-----+ @@ -283,7 +283,7 @@ async fn test_first_last_value_group_by_dict_nulls() -> Result<()> { | val_x | 10 | 50 | 2 | | val_y | 30 | 30 | 1 | +----------------+-----------+----------+-----+ - "### + " ); Ok(()) @@ -394,7 +394,7 @@ async fn test_count_distinct_with_fuzz_table_dict_nulls() -> Result<()> { assert_snapshot!( batches_to_string(&results), - @r###" + @r" +--------+----------+---------------------+------+------+ | u8_low | utf8_low | dictionary_utf8_low | col1 | col2 | +--------+----------+---------------------+------+------+ @@ -405,7 +405,7 @@ async 
fn test_count_distinct_with_fuzz_table_dict_nulls() -> Result<()> { | 20 | text_e | | 0 | 1 | | 25 | text_f | group_gamma | 1 | 1 | +--------+----------+---------------------+------+------+ - "### + " ); Ok(()) diff --git a/datafusion/core/tests/sql/aggregates/mod.rs b/datafusion/core/tests/sql/aggregates/mod.rs index 321c158628e43..ede40d5c4ceca 100644 --- a/datafusion/core/tests/sql/aggregates/mod.rs +++ b/datafusion/core/tests/sql/aggregates/mod.rs @@ -20,15 +20,15 @@ use super::*; use arrow::{ array::{ - types::UInt32Type, Decimal128Array, DictionaryArray, DurationNanosecondArray, - Int32Array, LargeBinaryArray, StringArray, TimestampMicrosecondArray, - UInt16Array, UInt32Array, UInt64Array, UInt8Array, + Decimal128Array, DictionaryArray, DurationNanosecondArray, Int32Array, + LargeBinaryArray, StringArray, TimestampMicrosecondArray, UInt8Array, + UInt16Array, UInt32Array, UInt64Array, types::UInt32Type, }, datatypes::{DataType, Field, Schema, TimeUnit}, record_batch::RecordBatch, }; use datafusion::{ - common::{test_util::batches_to_string, Result}, + common::{Result, test_util::batches_to_string}, execution::{config::SessionConfig, context::SessionContext}, }; use datafusion_catalog::MemTable; @@ -959,8 +959,8 @@ impl FuzzTimestampTestData { } /// Sets up test contexts for fuzz table with timestamps and both single and multiple partitions -pub async fn setup_fuzz_timestamp_test_contexts( -) -> Result<(SessionContext, SessionContext)> { +pub async fn setup_fuzz_timestamp_test_contexts() +-> Result<(SessionContext, SessionContext)> { let test_data = FuzzTimestampTestData::new(); // Single partition context diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs index 8d98b91547fe7..8ab0d150a7272 100644 --- a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -22,7 +22,7 @@ use rstest::rstest; use datafusion::config::ConfigOptions; use datafusion::physical_plan::display::DisplayableExecutionPlan; use datafusion::physical_plan::metrics::Timestamp; -use datafusion_common::format::ExplainAnalyzeLevel; +use datafusion_common::format::{ExplainAnalyzeCategories, MetricCategory, MetricType}; use object_store::path::Path; #[tokio::test] @@ -61,87 +61,92 @@ async fn explain_analyze_baseline_metrics() { assert_metrics!( &formatted, "AggregateExec: mode=Partial, gby=[]", - "metrics=[output_rows=3, elapsed_compute=" - ); - assert_metrics!( - &formatted, - "AggregateExec: mode=Partial, gby=[]", - "output_bytes=" + "metrics=[output_rows=3, elapsed_compute=", + "output_bytes=", + "output_batches=3" ); assert_metrics!( &formatted, "AggregateExec: mode=Partial, gby=[c1@0 as c1]", - "reduction_factor=5.1% (5/99)" + "reduction_factor=5.05% (5/99)" ); - assert_metrics!( - &formatted, - "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1]", - "metrics=[output_rows=5, elapsed_compute=" - ); - assert_metrics!( - &formatted, - "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1]", - "output_bytes=" - ); - assert_metrics!( - &formatted, - "FilterExec: c13@1 != C2GT5KVyOPZpgKVl110TyZO0NcJ434", - "metrics=[output_rows=99, elapsed_compute=" - ); + { + let expected_batch_count_after_repartition = + if cfg!(not(feature = "force_hash_collisions")) { + "output_batches=3" + } else { + "output_batches=1" + }; + + assert_metrics!( + &formatted, + "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1]", + "metrics=[output_rows=5, elapsed_compute=", + "output_bytes=", + expected_batch_count_after_repartition + ); + + 
assert_metrics!( + &formatted, + "RepartitionExec: partitioning=Hash([c1@0], 3), input_partitions=3", + "metrics=[output_rows=5, elapsed_compute=", + "output_bytes=", + expected_batch_count_after_repartition + ); + + assert_metrics!( + &formatted, + "ProjectionExec: expr=[]", + "metrics=[output_rows=5, elapsed_compute=", + "output_bytes=", + expected_batch_count_after_repartition + ); + } + assert_metrics!( &formatted, "FilterExec: c13@1 != C2GT5KVyOPZpgKVl110TyZO0NcJ434", - "output_bytes=" + "metrics=[output_rows=99, elapsed_compute=", + "output_bytes=", + "output_batches=1" ); + assert_metrics!( &formatted, "FilterExec: c13@1 != C2GT5KVyOPZpgKVl110TyZO0NcJ434", "selectivity=99% (99/100)" ); - assert_metrics!( - &formatted, - "ProjectionExec: expr=[]", - "metrics=[output_rows=5, elapsed_compute=" - ); - assert_metrics!(&formatted, "ProjectionExec: expr=[]", "output_bytes="); - assert_metrics!( - &formatted, - "CoalesceBatchesExec: target_batch_size=4096", - "metrics=[output_rows=5, elapsed_compute" - ); - assert_metrics!( - &formatted, - "CoalesceBatchesExec: target_batch_size=4096", - "output_bytes=" - ); + assert_metrics!( &formatted, "UnionExec", - "metrics=[output_rows=3, elapsed_compute=" + "metrics=[output_rows=3, elapsed_compute=", + "output_bytes=", + "output_batches=3" ); - assert_metrics!(&formatted, "UnionExec", "output_bytes="); + assert_metrics!( &formatted, "WindowAggExec", - "metrics=[output_rows=1, elapsed_compute=" + "metrics=[output_rows=1, elapsed_compute=", + "output_bytes=", + "output_batches=1" ); - assert_metrics!(&formatted, "WindowAggExec", "output_bytes="); fn expected_to_have_metrics(plan: &dyn ExecutionPlan) -> bool { use datafusion::physical_plan; use datafusion::physical_plan::sorts; - plan.as_any().downcast_ref::().is_some() - || plan.as_any().downcast_ref::().is_some() - || plan.as_any().downcast_ref::().is_some() - || plan.as_any().downcast_ref::().is_some() - || plan.as_any().downcast_ref::().is_some() - || plan.as_any().downcast_ref::().is_some() - || plan.as_any().downcast_ref::().is_some() - || plan.as_any().downcast_ref::().is_some() - || plan.as_any().downcast_ref::().is_some() + plan.is::() + || plan.is::() + || plan.is::() + || plan.is::() + || plan.is::() + || plan.is::() + || plan.is::() + || plan.is::() } // Validate that the recorded elapsed compute time was more than @@ -200,7 +205,7 @@ fn nanos_from_timestamp(ts: &Timestamp) -> i64 { async fn collect_plan_with_context( sql_str: &str, ctx: &SessionContext, - level: ExplainAnalyzeLevel, + level: MetricType, ) -> String { { let state = ctx.state_ref(); @@ -214,7 +219,24 @@ async fn collect_plan_with_context( .to_string() } -async fn collect_plan(sql_str: &str, level: ExplainAnalyzeLevel) -> String { +async fn collect_plan_with_categories( + sql_str: &str, + categories: ExplainAnalyzeCategories, +) -> String { + let ctx = SessionContext::new(); + { + let state = ctx.state_ref(); + let mut state = state.write(); + state.config_mut().options_mut().explain.analyze_categories = categories; + } + let dataframe = ctx.sql(sql_str).await.unwrap(); + let batches = dataframe.collect().await.unwrap(); + arrow::util::pretty::pretty_format_batches(&batches) + .unwrap() + .to_string() +} + +async fn collect_plan(sql_str: &str, level: MetricType) -> String { let ctx = SessionContext::new(); collect_plan_with_context(sql_str, &ctx, level).await } @@ -227,10 +249,14 @@ async fn explain_analyze_level() { ORDER BY v1 DESC"; for (level, needle, should_contain) in [ - (ExplainAnalyzeLevel::Summary, "spill_count", 
false), - (ExplainAnalyzeLevel::Summary, "output_rows", true), - (ExplainAnalyzeLevel::Dev, "spill_count", true), - (ExplainAnalyzeLevel::Dev, "output_rows", true), + (MetricType::Summary, "spill_count", false), + (MetricType::Summary, "output_batches", false), + (MetricType::Summary, "output_rows", true), + (MetricType::Summary, "output_bytes", true), + (MetricType::Dev, "spill_count", true), + (MetricType::Dev, "output_rows", true), + (MetricType::Dev, "output_bytes", true), + (MetricType::Dev, "output_batches", true), ] { let plan = collect_plan(sql, level).await; assert_eq!( @@ -254,10 +280,10 @@ async fn explain_analyze_level_datasource_parquet() { .expect("register parquet table for explain analyze test"); for (level, needle, should_contain) in [ - (ExplainAnalyzeLevel::Summary, "metadata_load_time", true), - (ExplainAnalyzeLevel::Summary, "page_index_eval_time", false), - (ExplainAnalyzeLevel::Dev, "metadata_load_time", true), - (ExplainAnalyzeLevel::Dev, "page_index_eval_time", true), + (MetricType::Summary, "metadata_load_time", true), + (MetricType::Summary, "page_index_eval_time", false), + (MetricType::Dev, "metadata_load_time", true), + (MetricType::Dev, "page_index_eval_time", true), ] { let plan = collect_plan_with_context(&sql, &ctx, level).await; @@ -290,8 +316,7 @@ async fn explain_analyze_parquet_pruning_metrics() { "explain analyze select * from {table_name} where l_orderkey = {l_orderkey};" ); - let plan = - collect_plan_with_context(&sql, &ctx, ExplainAnalyzeLevel::Summary).await; + let plan = collect_plan_with_context(&sql, &ctx, MetricType::Summary).await; let expected_metrics = format!("files_ranges_pruned_statistics={expected_pruning_metrics}"); @@ -336,12 +361,12 @@ async fn csv_explain_plans() { let actual = formatted.trim(); assert_snapshot!( actual, - @r###" + @r" Explain Projection: aggregate_test_100.c1 Filter: aggregate_test_100.c2 > Int64(10) TableScan: aggregate_test_100 - "### + " ); // // verify the grahviz format of the plan @@ -407,13 +432,12 @@ async fn csv_explain_plans() { let actual = formatted.trim(); assert_snapshot!( actual, - @r###" + @r" Explain Projection: aggregate_test_100.c1 Filter: aggregate_test_100.c2 > Int8(10) TableScan: aggregate_test_100 projection=[c1, c2], partial_filters=[aggregate_test_100.c2 > Int8(10)] - - "### + " ); // // verify the grahviz format of the plan @@ -553,12 +577,12 @@ async fn csv_explain_verbose_plans() { let actual = formatted.trim(); assert_snapshot!( actual, - @r###" + @r" Explain Projection: aggregate_test_100.c1 Filter: aggregate_test_100.c2 > Int64(10) TableScan: aggregate_test_100 - "### + " ); // // verify the grahviz format of the plan @@ -624,12 +648,12 @@ async fn csv_explain_verbose_plans() { let actual = formatted.trim(); assert_snapshot!( actual, - @r###" + @r" Explain Projection: aggregate_test_100.c1 Filter: aggregate_test_100.c2 > Int8(10) TableScan: aggregate_test_100 projection=[c1, c2], partial_filters=[aggregate_test_100.c2 > Int8(10)] - "### + " ); // // verify the grahviz format of the plan @@ -748,19 +772,17 @@ async fn test_physical_plan_display_indent() { assert_snapshot!( actual, - @r###" + @r" SortPreservingMergeExec: [the_min@2 DESC], fetch=10 SortExec: TopK(fetch=10), expr=[the_min@2 DESC], preserve_partitioning=[true] ProjectionExec: expr=[c1@0 as c1, max(aggregate_test_100.c12)@1 as max(aggregate_test_100.c12), min(aggregate_test_100.c12)@2 as the_min] AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[max(aggregate_test_100.c12), min(aggregate_test_100.c12)] - 
CoalesceBatchesExec: target_batch_size=4096 - RepartitionExec: partitioning=Hash([c1@0], 9000), input_partitions=9000 - AggregateExec: mode=Partial, gby=[c1@0 as c1], aggr=[max(aggregate_test_100.c12), min(aggregate_test_100.c12)] - CoalesceBatchesExec: target_batch_size=4096 - FilterExec: c12@1 < 10 - RepartitionExec: partitioning=RoundRobinBatch(9000), input_partitions=1 - DataSourceExec: file_groups={1 group: [[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, projection=[c1, c12], file_type=csv, has_header=true - "### + RepartitionExec: partitioning=Hash([c1@0], 9000), input_partitions=9000 + AggregateExec: mode=Partial, gby=[c1@0 as c1], aggr=[max(aggregate_test_100.c12), min(aggregate_test_100.c12)] + FilterExec: c12@1 < 10 + RepartitionExec: partitioning=RoundRobinBatch(9000), input_partitions=1 + DataSourceExec: file_groups={1 group: [[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, projection=[c1, c12], file_type=csv, has_header=true + " ); } @@ -794,19 +816,13 @@ async fn test_physical_plan_display_indent_multi_children() { assert_snapshot!( actual, - @r###" - CoalesceBatchesExec: target_batch_size=4096 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c1@0, c2@0)], projection=[c1@0] - CoalesceBatchesExec: target_batch_size=4096 - RepartitionExec: partitioning=Hash([c1@0], 9000), input_partitions=9000 - RepartitionExec: partitioning=RoundRobinBatch(9000), input_partitions=1 - DataSourceExec: file_groups={1 group: [[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, projection=[c1], file_type=csv, has_header=true - CoalesceBatchesExec: target_batch_size=4096 - RepartitionExec: partitioning=Hash([c2@0], 9000), input_partitions=9000 - RepartitionExec: partitioning=RoundRobinBatch(9000), input_partitions=1 - ProjectionExec: expr=[c1@0 as c2] - DataSourceExec: file_groups={1 group: [[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, projection=[c1], file_type=csv, has_header=true - "### + @r" + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c1@0, c2@0)], projection=[c1@0] + RepartitionExec: partitioning=Hash([c1@0], 9000), input_partitions=1 + DataSourceExec: file_groups={1 group: [[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, projection=[c1], file_type=csv, has_header=true + RepartitionExec: partitioning=Hash([c2@0], 9000), input_partitions=1 + DataSourceExec: file_groups={1 group: [[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, projection=[c1@0 as c2], file_type=csv, has_header=true + " ); } @@ -845,8 +861,7 @@ async fn csv_explain_analyze_order_by() { // Ensure that the ordering is not optimized away from the plan // https://github.com/apache/datafusion/issues/6379 - let needle = - "SortExec: expr=[c1@0 ASC NULLS LAST], preserve_partitioning=[false], metrics=[output_rows=100, elapsed_compute"; + let needle = "SortExec: expr=[c1@0 ASC NULLS LAST], preserve_partitioning=[false], metrics=[output_rows=100, elapsed_compute"; assert_contains!(&formatted, needle); } @@ -872,6 +887,8 @@ async fn parquet_explain_analyze() { &formatted, "row_groups_pruned_statistics=1 total \u{2192} 1 matched" ); + assert_contains!(&formatted, "output_rows_skew=0%"); + assert_contains!(&formatted, "scan_efficiency_ratio=13.99%"); // The order of metrics is expected to be the same as the actual pruning order // (file-> row-group -> page) @@ -879,13 +896,14 @@ async fn parquet_explain_analyze() { let i_rowgroup_stat = formatted.find("row_groups_pruned_statistics").unwrap(); let i_rowgroup_bloomfilter = formatted.find("row_groups_pruned_bloom_filter").unwrap(); - let i_page = 
formatted.find("page_index_rows_pruned").unwrap(); + let i_page_rows = formatted.find("page_index_rows_pruned").unwrap(); + let i_page_pages = formatted.find("page_index_pages_pruned").unwrap(); assert!( (i_file < i_rowgroup_stat) && (i_rowgroup_stat < i_rowgroup_bloomfilter) - && (i_rowgroup_bloomfilter < i_page), - "The parquet pruning metrics should be displayed in an order of: file range -> row group statistics -> row group bloom filter -> page index." + && (i_rowgroup_bloomfilter < i_page_pages && i_page_pages < i_page_rows), + "The parquet pruning metrics should be displayed in an order of: file range -> row group statistics -> row group bloom filter -> page index." ); } @@ -997,16 +1015,14 @@ async fn parquet_recursive_projection_pushdown() -> Result<()> { RecursiveQueryExec: name=number_series, is_distinct=false CoalescePartitionsExec ProjectionExec: expr=[id@0 as id, 1 as level] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: id@0 = 1 - RepartitionExec: partitioning=RoundRobinBatch(NUM_CORES), input_partitions=1 - DataSourceExec: file_groups={1 group: [[TMP_DIR/hierarchy.parquet]]}, projection=[id], file_type=parquet, predicate=id@0 = 1, pruning_predicate=id_null_count@2 != row_count@3 AND id_min@0 <= 1 AND 1 <= id_max@1, required_guarantees=[id in (1)] + FilterExec: id@0 = 1 + RepartitionExec: partitioning=RoundRobinBatch(NUM_CORES), input_partitions=1 + DataSourceExec: file_groups={1 group: [[TMP_DIR/hierarchy.parquet]]}, projection=[id], file_type=parquet, predicate=id@0 = 1, pruning_predicate=id_null_count@2 != row_count@3 AND id_min@0 <= 1 AND 1 <= id_max@1, required_guarantees=[id in (1)] CoalescePartitionsExec ProjectionExec: expr=[id@0 + 1 as ns.id + Int64(1), level@1 + 1 as ns.level + Int64(1)] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: id@0 < 10 - RepartitionExec: partitioning=RoundRobinBatch(NUM_CORES), input_partitions=1 - WorkTableExec: name=number_series + FilterExec: id@0 < 10 + RepartitionExec: partitioning=RoundRobinBatch(NUM_CORES), input_partitions=1 + WorkTableExec: name=number_series " ); @@ -1082,11 +1098,11 @@ async fn explain_physical_plan_only() { assert_snapshot!( actual, - @r###" + @r" physical_plan ProjectionExec: expr=[2 as count(*)] PlaceholderRowExec - "### + " ); } @@ -1110,3 +1126,144 @@ async fn csv_explain_analyze_with_statistics() { ", statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]" ); } + +#[tokio::test] +async fn nested_loop_join_selectivity() { + for (join_type, expected_selectivity) in [ + ("INNER", "1% (1/100)"), + ("LEFT", "10% (10/100)"), + ("RIGHT", "10% (10/100)"), + // 1 match + 9 left + 9 right = 19 + ("FULL", "19% (19/100)"), + ] { + let ctx = SessionContext::new(); + let sql = format!( + "EXPLAIN ANALYZE SELECT * \ + FROM generate_series(1, 10) as t1(a) \ + {join_type} JOIN generate_series(1, 10) as t2(b) \ + ON (t1.a + t2.b) = 20" + ); + + let actual = execute_to_batches(&ctx, sql.as_str()).await; + let formatted = arrow::util::pretty::pretty_format_batches(&actual) + .unwrap() + .to_string(); + + assert_metrics!( + &formatted, + "NestedLoopJoinExec", + &format!("selectivity={expected_selectivity}") + ); + } +} + +#[tokio::test] +async fn explain_analyze_hash_join() { + let sql = "EXPLAIN ANALYZE \ + SELECT * \ + FROM generate_series(10) as t1(a) \ + JOIN generate_series(20) as t2(b) \ + ON t1.a=t2.b"; + + for (level, needle, should_contain) in [ + (MetricType::Summary, "probe_hit_rate", true), + (MetricType::Summary, "avg_fanout", true), + ] { + let plan = collect_plan(sql, level).await; + 
assert_eq!(
+            plan.contains(needle),
+            should_contain,
+            "plan for level {level:?} unexpected content: {plan}"
+        );
+    }
+}
+
+#[tokio::test]
+async fn explain_analyze_categories() {
+    let sql = "EXPLAIN ANALYZE \
+        SELECT * \
+        FROM generate_series(10) as t1(v1) \
+        ORDER BY v1 DESC";
+
+    for (categories, needle, should_contain) in [
+        // "rows" category: output_rows yes, elapsed_compute no, output_bytes no
+        (
+            ExplainAnalyzeCategories::Only(vec![MetricCategory::Rows]),
+            "output_rows",
+            true,
+        ),
+        (
+            ExplainAnalyzeCategories::Only(vec![MetricCategory::Rows]),
+            "elapsed_compute",
+            false,
+        ),
+        (
+            ExplainAnalyzeCategories::Only(vec![MetricCategory::Rows]),
+            "output_bytes",
+            false,
+        ),
+        // "none" — plan only, no metrics at all
+        (ExplainAnalyzeCategories::Only(vec![]), "output_rows", false),
+        (
+            ExplainAnalyzeCategories::Only(vec![]),
+            "elapsed_compute",
+            false,
+        ),
+        // "all" — everything shown
+        (ExplainAnalyzeCategories::All, "output_rows", true),
+        (ExplainAnalyzeCategories::All, "elapsed_compute", true),
+        (ExplainAnalyzeCategories::All, "output_bytes", true),
+        // "rows,bytes" — row + byte metrics, no timing
+        (
+            ExplainAnalyzeCategories::Only(vec![
+                MetricCategory::Rows,
+                MetricCategory::Bytes,
+            ]),
+            "output_rows",
+            true,
+        ),
+        (
+            ExplainAnalyzeCategories::Only(vec![
+                MetricCategory::Rows,
+                MetricCategory::Bytes,
+            ]),
+            "output_bytes",
+            true,
+        ),
+        (
+            ExplainAnalyzeCategories::Only(vec![
+                MetricCategory::Rows,
+                MetricCategory::Bytes,
+            ]),
+            "elapsed_compute",
+            false,
+        ),
+        // "rows,bytes,uncategorized" — everything except timing
+        (
+            ExplainAnalyzeCategories::Only(vec![
+                MetricCategory::Rows,
+                MetricCategory::Bytes,
+                MetricCategory::Uncategorized,
+            ]),
+            "output_rows",
+            true,
+        ),
+        (
+            ExplainAnalyzeCategories::Only(vec![
+                MetricCategory::Rows,
+                MetricCategory::Bytes,
+                MetricCategory::Uncategorized,
+            ]),
+            "elapsed_compute",
+            false,
+        ),
+    ] {
+        let plan = collect_plan_with_categories(sql, categories.clone()).await;
+        assert_eq!(
+            plan.contains(needle),
+            should_contain,
+            "plan for categories {categories:?} should{} contain '{needle}':\n{plan}",
+            if should_contain { "" } else { " NOT" }
+        );
+    }
+}
diff --git a/datafusion/core/tests/sql/joins.rs b/datafusion/core/tests/sql/joins.rs
index 7a59834475920..7c0e89ee96418 100644
--- a/datafusion/core/tests/sql/joins.rs
+++ b/datafusion/core/tests/sql/joins.rs
@@ -38,14 +38,16 @@ async fn join_change_in_planner() -> Result<()> {
         Field::new("a2", DataType::UInt32, false),
     ]));
     // Specify the ordering:
-    let file_sort_order = vec![[col("a1")]
-        .into_iter()
-        .map(|e| {
-            let ascending = true;
-            let nulls_first = false;
-            e.sort(ascending, nulls_first)
-        })
-        .collect::<Vec<_>>()];
+    let file_sort_order = vec![
+        [col("a1")]
+            .into_iter()
+            .map(|e| {
+                let ascending = true;
+                let nulls_first = false;
+                e.sort(ascending, nulls_first)
+            })
+            .collect::<Vec<_>>(),
+    ];
     register_unbounded_file_with_ordering(
         &ctx,
         schema.clone(),
@@ -72,14 +74,10 @@ async fn join_change_in_planner() -> Result<()> {
         actual,
         @r"
     SymmetricHashJoinExec: mode=Partitioned, join_type=Full, on=[(a2@1, a2@1)], filter=CAST(a1@0 AS Int64) > CAST(a1@1 AS Int64) + 3 AND CAST(a1@0 AS Int64) < CAST(a1@1 AS Int64) + 10
-      CoalesceBatchesExec: target_batch_size=8192
-        RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a1@0 ASC NULLS LAST
-          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-            StreamingTableExec: partition_sizes=1, projection=[a1, a2], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST]
-      CoalesceBatchesExec: target_batch_size=8192
-        RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a1@0 ASC NULLS LAST
-          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-            StreamingTableExec: partition_sizes=1, projection=[a1, a2], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST]
+      RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=1, maintains_sort_order=true
+        StreamingTableExec: partition_sizes=1, projection=[a1, a2], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST]
+      RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=1, maintains_sort_order=true
+        StreamingTableExec: partition_sizes=1, projection=[a1, a2], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST]
     "
     );
     Ok(())
@@ -99,14 +97,16 @@ async fn join_no_order_on_filter() -> Result<()> {
         Field::new("a3", DataType::UInt32, false),
     ]));
     // Specify the ordering:
-    let file_sort_order = vec![[col("a1")]
-        .into_iter()
-        .map(|e| {
-            let ascending = true;
-            let nulls_first = false;
-            e.sort(ascending, nulls_first)
-        })
-        .collect::<Vec<_>>()];
+    let file_sort_order = vec![
+        [col("a1")]
+            .into_iter()
+            .map(|e| {
+                let ascending = true;
+                let nulls_first = false;
+                e.sort(ascending, nulls_first)
+            })
+            .collect::<Vec<_>>(),
+    ];
     register_unbounded_file_with_ordering(
         &ctx,
         schema.clone(),
@@ -133,14 +133,10 @@ async fn join_no_order_on_filter() -> Result<()> {
         actual,
         @r"
     SymmetricHashJoinExec: mode=Partitioned, join_type=Full, on=[(a2@1, a2@1)], filter=CAST(a3@0 AS Int64) > CAST(a3@1 AS Int64) + 3 AND CAST(a3@0 AS Int64) < CAST(a3@1 AS Int64) + 10
-      CoalesceBatchesExec: target_batch_size=8192
-        RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8
-          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-            StreamingTableExec: partition_sizes=1, projection=[a1, a2, a3], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST]
-      CoalesceBatchesExec: target_batch_size=8192
-        RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8
-          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-            StreamingTableExec: partition_sizes=1, projection=[a1, a2, a3], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST]
+      RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=1, maintains_sort_order=true
+        StreamingTableExec: partition_sizes=1, projection=[a1, a2, a3], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST]
+      RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=1, maintains_sort_order=true
+        StreamingTableExec: partition_sizes=1, projection=[a1, a2, a3], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST]
    "
    );
    Ok(())
@@ -176,14 +172,10 @@ async fn join_change_in_planner_without_sort() -> Result<()> {
         actual,
         @r"
     SymmetricHashJoinExec: mode=Partitioned, join_type=Full, on=[(a2@1, a2@1)], filter=CAST(a1@0 AS Int64) > CAST(a1@1 AS Int64) + 3 AND CAST(a1@0 AS Int64) < CAST(a1@1 AS Int64) + 10
-      CoalesceBatchesExec: target_batch_size=8192
-        RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8
-          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-            StreamingTableExec: partition_sizes=1, projection=[a1, a2], infinite_source=true
-      CoalesceBatchesExec: target_batch_size=8192
-        RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8
-          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-            StreamingTableExec: partition_sizes=1, projection=[a1, a2],
infinite_source=true + RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=1 + StreamingTableExec: partition_sizes=1, projection=[a1, a2], infinite_source=true + RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=1 + StreamingTableExec: partition_sizes=1, projection=[a1, a2], infinite_source=true " ); Ok(()) @@ -214,7 +206,10 @@ async fn join_change_in_planner_without_sort_not_allowed() -> Result<()> { match df.create_physical_plan().await { Ok(_) => panic!("Expecting error."), Err(e) => { - assert_eq!(e.strip_backtrace(), "SanityCheckPlan\ncaused by\nError during planning: Join operation cannot operate on a non-prunable stream without enabling the 'allow_symmetric_joins_without_pruning' configuration flag") + assert_eq!( + e.strip_backtrace(), + "SanityCheckPlan\ncaused by\nError during planning: Join operation cannot operate on a non-prunable stream without enabling the 'allow_symmetric_joins_without_pruning' configuration flag" + ) } } Ok(()) @@ -295,16 +290,12 @@ async fn unparse_cross_join() -> Result<()> { .await?; let unopt_sql = plan_to_sql(df.logical_plan())?; - assert_snapshot!(unopt_sql, @r#" - SELECT j1.j1_id, j2.j2_string FROM j1 CROSS JOIN j2 WHERE (j2.j2_id = 0) - "#); + assert_snapshot!(unopt_sql, @"SELECT j1.j1_id, j2.j2_string FROM j1 CROSS JOIN j2 WHERE (j2.j2_id = 0)"); let optimized_plan = df.into_optimized_plan()?; let opt_sql = plan_to_sql(&optimized_plan)?; - assert_snapshot!(opt_sql, @r#" - SELECT j1.j1_id, j2.j2_string FROM j1 CROSS JOIN j2 WHERE (j2.j2_id = 0) - "#); + assert_snapshot!(opt_sql, @"SELECT j1.j1_id, j2.j2_string FROM j1 CROSS JOIN j2 WHERE (j2.j2_id = 0)"); Ok(()) } diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs index 743c8750b5215..9a1dc5502ee60 100644 --- a/datafusion/core/tests/sql/mod.rs +++ b/datafusion/core/tests/sql/mod.rs @@ -24,10 +24,10 @@ use arrow::{ use datafusion::error::Result; use datafusion::logical_expr::{Aggregate, LogicalPlan, TableScan}; -use datafusion::physical_plan::collect; -use datafusion::physical_plan::metrics::MetricValue; use datafusion::physical_plan::ExecutionPlan; use datafusion::physical_plan::ExecutionPlanVisitor; +use datafusion::physical_plan::collect; +use datafusion::physical_plan::metrics::MetricValue; use datafusion::prelude::*; use datafusion::test_util; use datafusion::{execution::context::SessionContext, physical_plan::displayable}; @@ -40,18 +40,24 @@ use std::io::Write; use std::path::PathBuf; use tempfile::TempDir; -/// A macro to assert that some particular line contains two substrings +/// A macro to assert that some particular line contains the given substrings /// -/// Usage: `assert_metrics!(actual, operator_name, metrics)` +/// Usage: `assert_metrics!(actual, operator_name, metrics_1, metrics_2, ...)` macro_rules! 
assert_metrics {
-    ($ACTUAL: expr, $OPERATOR_NAME: expr, $METRICS: expr) => {
+    ($ACTUAL: expr, $OPERATOR_NAME: expr, $($METRICS: expr),+) => {
         let found = $ACTUAL
             .lines()
-            .any(|line| line.contains($OPERATOR_NAME) && line.contains($METRICS));
+            .any(|line| line.contains($OPERATOR_NAME) $( && line.contains($METRICS))+);
+
+        let mut metrics = String::new();
+        $(metrics.push_str(format!(" '{}',", $METRICS).as_str());)+
+        // remove the last `,` from the string
+        metrics.pop();
+
         assert!(
             found,
-            "Can not find a line with both '{}' and '{}' in\n\n{}",
-            $OPERATOR_NAME, $METRICS, $ACTUAL
+            "Cannot find a line with operator name '{}' and metrics containing values {} in:\n\n{}",
+            $OPERATOR_NAME, metrics, $ACTUAL
         );
     };
 }
@@ -64,6 +70,7 @@ mod path_partition;
 mod runtime_config;
 pub mod select;
 mod sql_api;
+mod unparser;
 
 async fn register_aggregate_csv_by_sql(ctx: &SessionContext) {
     let testdata = test_util::arrow_test_data();
@@ -329,8 +336,7 @@ async fn nyc() -> Result<()> {
     match &optimized_plan {
         LogicalPlan::Aggregate(Aggregate { input, .. }) => match input.as_ref() {
             LogicalPlan::TableScan(TableScan {
-                ref projected_schema,
-                ..
+                projected_schema, ..
             }) => {
                 assert_eq!(2, projected_schema.fields().len());
                 assert_eq!(projected_schema.field(0).name(), "passenger_count");
diff --git a/datafusion/core/tests/sql/path_partition.rs b/datafusion/core/tests/sql/path_partition.rs
index 05cc723ef05fb..2eff1c262f855 100644
--- a/datafusion/core/tests/sql/path_partition.rs
+++ b/datafusion/core/tests/sql/path_partition.rs
@@ -20,7 +20,6 @@
 use std::collections::BTreeSet;
 use std::fs::File;
 use std::io::{Read, Seek, SeekFrom};
-use std::ops::Range;
 use std::sync::Arc;
 
 use arrow::datatypes::DataType;
@@ -31,26 +30,28 @@ use datafusion::{
         listing::{ListingOptions, ListingTable, ListingTableConfig},
     },
     error::Result,
-    physical_plan::ColumnStatistics,
     prelude::SessionContext,
     test_util::{self, arrow_test_data, parquet_test_data},
 };
 use datafusion_catalog::TableProvider;
+use datafusion_common::ScalarValue;
 use datafusion_common::stats::Precision;
 use datafusion_common::test_util::batches_to_sort_string;
-use datafusion_common::ScalarValue;
 use datafusion_execution::config::SessionConfig;
 
 use async_trait::async_trait;
 use bytes::Bytes;
 use chrono::{TimeZone, Utc};
+use futures::StreamExt;
 use futures::stream::{self, BoxStream};
 use insta::assert_snapshot;
 use object_store::{
-    path::Path, GetOptions, GetResult, GetResultPayload, ListResult, ObjectMeta,
-    ObjectStore, PutOptions, PutResult,
+    Attributes, CopyOptions, GetRange, MultipartUpload, PutMultipartOptions, PutPayload,
+};
+use object_store::{
+    GetOptions, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore,
+    PutOptions, PutResult, path::Path,
 };
-use object_store::{Attributes, MultipartUpload, PutMultipartOptions, PutPayload};
 use url::Url;
 
 #[tokio::test]
@@ -460,14 +461,26 @@ async fn parquet_statistics() -> Result<()> {
     let schema = physical_plan.schema();
     assert_eq!(schema.fields().len(), 4);
 
-    let stat_cols = physical_plan.partition_statistics(None)?.column_statistics;
+    let stat_cols = physical_plan
+ .column_statistics + .clone(); assert_eq!(stat_cols.len(), 4); // stats for the first col are read from the parquet file assert_eq!(stat_cols[0].null_count, Precision::Exact(3)); - // TODO assert partition column (1,2,3) stats once implemented (#1186) - assert_eq!(stat_cols[1], ColumnStatistics::new_unknown(),); - assert_eq!(stat_cols[2], ColumnStatistics::new_unknown(),); - assert_eq!(stat_cols[3], ColumnStatistics::new_unknown(),); + // Partition column statistics (year=2021 for all 3 rows) + assert_eq!(stat_cols[1].null_count, Precision::Exact(0)); + assert_eq!( + stat_cols[1].min_value, + Precision::Exact(ScalarValue::Int32(Some(2021))) + ); + assert_eq!( + stat_cols[1].max_value, + Precision::Exact(ScalarValue::Int32(Some(2021))) + ); + // month and day are Utf8 partition columns with statistics + assert_eq!(stat_cols[2].null_count, Precision::Exact(0)); + assert_eq!(stat_cols[3].null_count, Precision::Exact(0)); //// WITH PROJECTION //// let dataframe = ctx.sql("SELECT mycol, day FROM t WHERE day='28'").await?; @@ -475,12 +488,23 @@ async fn parquet_statistics() -> Result<()> { let schema = physical_plan.schema(); assert_eq!(schema.fields().len(), 2); - let stat_cols = physical_plan.partition_statistics(None)?.column_statistics; + let stat_cols = physical_plan + .partition_statistics(None)? + .column_statistics + .clone(); assert_eq!(stat_cols.len(), 2); // stats for the first col are read from the parquet file assert_eq!(stat_cols[0].null_count, Precision::Exact(1)); - // TODO assert partition column stats once implemented (#1186) - assert_eq!(stat_cols[1], ColumnStatistics::new_unknown()); + // Partition column statistics for day='28' (1 row) + assert_eq!(stat_cols[1].null_count, Precision::Exact(0)); + assert_eq!( + stat_cols[1].min_value, + Precision::Exact(ScalarValue::Utf8(Some("28".to_string()))) + ); + assert_eq!( + stat_cols[1].max_value, + Precision::Exact(ScalarValue::Utf8(Some("28".to_string()))) + ); Ok(()) } @@ -604,7 +628,7 @@ async fn create_partitioned_alltypes_parquet_table( } #[derive(Debug)] -/// An object store implem that is mirrors a given file to multiple paths. +/// An object store implem that mirrors a given file to multiple paths. 
pub struct MirroringObjectStore {
     /// The `(path,size)` of the files that "exist" in the store
     files: Vec<Path>,
@@ -653,12 +677,13 @@ impl ObjectStore for MirroringObjectStore {
     async fn get_opts(
         &self,
         location: &Path,
-        _options: GetOptions,
+        options: GetOptions,
     ) -> object_store::Result<GetResult> {
         self.files.iter().find(|x| *x == location).unwrap();
         let path = std::path::PathBuf::from(&self.mirrored_file);
         let file = File::open(&path).unwrap();
         let metadata = file.metadata().unwrap();
+
         let meta = ObjectMeta {
             location: location.clone(),
             last_modified: metadata.modified().map(chrono::DateTime::from).unwrap(),
@@ -667,37 +692,35 @@ impl ObjectStore for MirroringObjectStore {
             version: None,
         };
 
+        let payload = if options.head {
+            // no content for head requests
+            GetResultPayload::Stream(stream::empty().boxed())
+        } else if let Some(range) = options.range {
+            let GetRange::Bounded(range) = range else {
+                unimplemented!("Unbounded range not supported in MirroringObjectStore");
+            };
+            let mut file = File::open(path).unwrap();
+            file.seek(SeekFrom::Start(range.start)).unwrap();
+
+            let to_read = range.end - range.start;
+            let to_read: usize = to_read.try_into().unwrap();
+            let mut data = Vec::with_capacity(to_read);
+            let read = file.take(to_read as u64).read_to_end(&mut data).unwrap();
+            assert_eq!(read, to_read);
+            let stream = stream::once(async move { Ok(Bytes::from(data)) }).boxed();
+            GetResultPayload::Stream(stream)
+        } else {
+            GetResultPayload::File(file, path)
+        };
+
         Ok(GetResult {
             range: 0..meta.size,
-            payload: GetResultPayload::File(file, path),
+            payload,
             meta,
             attributes: Attributes::default(),
         })
     }
 
-    async fn get_range(
-        &self,
-        location: &Path,
-        range: Range<u64>,
-    ) -> object_store::Result<Bytes> {
-        self.files.iter().find(|x| *x == location).unwrap();
-        let path = std::path::PathBuf::from(&self.mirrored_file);
-        let mut file = File::open(path).unwrap();
-        file.seek(SeekFrom::Start(range.start)).unwrap();
-
-        let to_read = range.end - range.start;
-        let to_read: usize = to_read.try_into().unwrap();
-        let mut data = Vec::with_capacity(to_read);
-        let read = file.take(to_read as u64).read_to_end(&mut data).unwrap();
-        assert_eq!(read, to_read);
-
-        Ok(data.into())
-    }
-
-    async fn delete(&self, _location: &Path) -> object_store::Result<()> {
-        unimplemented!()
-    }
-
     fn list(
         &self,
         prefix: Option<&Path>,
@@ -712,6 +735,8 @@ impl ObjectStore for MirroringObjectStore {
             .map(|mut x| x.next().is_some())
             .unwrap_or(false);
 
+        #[expect(clippy::result_large_err)]
+        // closure only ever returns Ok; Err type is never constructed
         filter.then(|| {
             Ok(ObjectMeta {
                 location,
@@ -749,7 +774,7 @@ impl ObjectStore for MirroringObjectStore {
             };
 
             if parts.next().is_some() {
-                common_prefixes.insert(prefix.child(common_prefix));
+                common_prefixes.insert(prefix.clone().join(common_prefix));
             } else {
                 let object = ObjectMeta {
                     location: k.clone(),
@@ -767,14 +792,18 @@ impl ObjectStore for MirroringObjectStore {
         })
     }
 
-    async fn copy(&self, _from: &Path, _to: &Path) -> object_store::Result<()> {
+    fn delete_stream(
+        &self,
+        _locations: BoxStream<'static, object_store::Result<Path>>,
+    ) -> BoxStream<'static, object_store::Result<Path>> {
         unimplemented!()
     }
 
-    async fn copy_if_not_exists(
+    async fn copy_opts(
         &self,
         _from: &Path,
         _to: &Path,
+        _options: CopyOptions,
     ) -> object_store::Result<()> {
         unimplemented!()
     }
 }
diff --git a/datafusion/core/tests/sql/runtime_config.rs b/datafusion/core/tests/sql/runtime_config.rs
index 9627d7bccdb04..cf5237d725805 100644
--- a/datafusion/core/tests/sql/runtime_config.rs
+++
b/datafusion/core/tests/sql/runtime_config.rs @@ -18,9 +18,14 @@ //! Tests for runtime configuration SQL interface use std::sync::Arc; +use std::time::Duration; use datafusion::execution::context::SessionContext; use datafusion::execution::context::TaskContext; +use datafusion::prelude::SessionConfig; +use datafusion_execution::cache::DefaultListFilesCache; +use datafusion_execution::cache::cache_manager::CacheManagerConfig; +use datafusion_execution::runtime_env::RuntimeEnvBuilder; use datafusion_physical_plan::common::collect; #[tokio::test] @@ -140,7 +145,7 @@ async fn test_memory_limit_enforcement() { } #[tokio::test] -async fn test_invalid_memory_limit() { +async fn test_invalid_memory_limit_when_unit_is_invalid() { let ctx = SessionContext::new(); let result = ctx @@ -149,7 +154,26 @@ async fn test_invalid_memory_limit() { assert!(result.is_err()); let error_message = result.unwrap_err().to_string(); - assert!(error_message.contains("Unsupported unit 'X'")); + assert!( + error_message + .contains("Unsupported unit 'X' in 'datafusion.runtime.memory_limit'") + && error_message.contains("Unit must be one of: 'K', 'M', 'G'") + ); +} + +#[tokio::test] +async fn test_invalid_memory_limit_when_limit_is_not_numeric() { + let ctx = SessionContext::new(); + + let result = ctx + .sql("SET datafusion.runtime.memory_limit = 'invalid_memory_limit'") + .await; + + assert!(result.is_err()); + let error_message = result.unwrap_err().to_string(); + assert!(error_message.contains( + "Failed to parse number from 'datafusion.runtime.memory_limit', limit 'invalid_memory_limit'" + )); } #[tokio::test] @@ -233,6 +257,93 @@ async fn test_test_metadata_cache_limit() { assert_eq!(get_limit(&ctx), 123 * 1024); } +#[tokio::test] +async fn test_list_files_cache_limit() { + let list_files_cache = Arc::new(DefaultListFilesCache::default()); + + let rt = RuntimeEnvBuilder::new() + .with_cache_manager( + CacheManagerConfig::default().with_list_files_cache(Some(list_files_cache)), + ) + .build_arc() + .unwrap(); + + let ctx = SessionContext::new_with_config_rt(SessionConfig::default(), rt); + + let update_limit = async |ctx: &SessionContext, limit: &str| { + ctx.sql( + format!("SET datafusion.runtime.list_files_cache_limit = '{limit}'").as_str(), + ) + .await + .unwrap() + .collect() + .await + .unwrap(); + }; + + let get_limit = |ctx: &SessionContext| -> usize { + ctx.task_ctx() + .runtime_env() + .cache_manager + .get_list_files_cache() + .unwrap() + .cache_limit() + }; + + update_limit(&ctx, "100M").await; + assert_eq!(get_limit(&ctx), 100 * 1024 * 1024); + + update_limit(&ctx, "2G").await; + assert_eq!(get_limit(&ctx), 2 * 1024 * 1024 * 1024); + + update_limit(&ctx, "123K").await; + assert_eq!(get_limit(&ctx), 123 * 1024); +} + +#[tokio::test] +async fn test_list_files_cache_ttl() { + let list_files_cache = Arc::new(DefaultListFilesCache::default()); + + let rt = RuntimeEnvBuilder::new() + .with_cache_manager( + CacheManagerConfig::default().with_list_files_cache(Some(list_files_cache)), + ) + .build_arc() + .unwrap(); + + let ctx = SessionContext::new_with_config_rt(SessionConfig::default(), rt); + + let update_limit = async |ctx: &SessionContext, limit: &str| { + ctx.sql( + format!("SET datafusion.runtime.list_files_cache_ttl = '{limit}'").as_str(), + ) + .await + .unwrap() + .collect() + .await + .unwrap(); + }; + + let get_limit = |ctx: &SessionContext| -> Duration { + ctx.task_ctx() + .runtime_env() + .cache_manager + .get_list_files_cache() + .unwrap() + .cache_ttl() + .unwrap() + }; + + update_limit(&ctx, 
"1m").await; + assert_eq!(get_limit(&ctx), Duration::from_secs(60)); + + update_limit(&ctx, "30s").await; + assert_eq!(get_limit(&ctx), Duration::from_secs(30)); + + update_limit(&ctx, "1m30s").await; + assert_eq!(get_limit(&ctx), Duration::from_secs(90)); +} + #[tokio::test] async fn test_unknown_runtime_config() { let ctx = SessionContext::new(); diff --git a/datafusion/core/tests/sql/select.rs b/datafusion/core/tests/sql/select.rs index 8a0f620627384..96b911e8db130 100644 --- a/datafusion/core/tests/sql/select.rs +++ b/datafusion/core/tests/sql/select.rs @@ -18,8 +18,7 @@ use std::collections::HashMap; use super::*; -use datafusion::assert_batches_eq; -use datafusion_common::{metadata::ScalarAndMetadata, ParamValues, ScalarValue}; +use datafusion_common::{ParamValues, ScalarValue, metadata::ScalarAndMetadata}; use insta::assert_snapshot; #[tokio::test] @@ -223,10 +222,10 @@ async fn test_parameter_invalid_types() -> Result<()> { .await; assert_snapshot!(results.unwrap_err().strip_backtrace(), @r" - type_coercion - caused by - Error during planning: Cannot infer common argument type for comparison operation List(nullable Int32) = Int32 - "); + type_coercion + caused by + Error during planning: Cannot infer common argument type for comparison operation List(Int32) = Int32 + "); Ok(()) } @@ -343,26 +342,20 @@ async fn test_query_parameters_with_metadata() -> Result<()> { ])) .unwrap(); - // df_with_params_replaced.schema() is not correct here - // https://github.com/apache/datafusion/issues/18102 - let batches = df_with_params_replaced.clone().collect().await.unwrap(); - let schema = batches[0].schema(); - + let schema = df_with_params_replaced.schema(); assert_eq!(schema.field(0).data_type(), &DataType::UInt32); assert_eq!(schema.field(0).metadata(), &metadata1); assert_eq!(schema.field(1).data_type(), &DataType::Utf8); assert_eq!(schema.field(1).metadata(), &metadata2); - assert_batches_eq!( - [ - "+----+-----+", - "| $1 | $2 |", - "+----+-----+", - "| 1 | two |", - "+----+-----+", - ], - &batches - ); + let batches = df_with_params_replaced.collect().await.unwrap(); + assert_snapshot!(batches_to_sort_string(&batches), @r" + +----+-----+ + | $1 | $2 | + +----+-----+ + | 1 | two | + +----+-----+ + "); Ok(()) } @@ -421,3 +414,82 @@ async fn test_select_no_projection() -> Result<()> { "); Ok(()) } + +#[tokio::test] +async fn test_select_cast_date_literal_to_timestamp_overflow() -> Result<()> { + let ctx = SessionContext::new(); + let err = ctx + .sql("SELECT CAST(DATE '9999-12-31' AS TIMESTAMP)") + .await? + .collect() + .await + .unwrap_err(); + + assert_contains!( + err.to_string(), + "Cannot cast Date32 value 2932896 to Timestamp(ns): converted value exceeds the representable i64 range" + ); + Ok(()) +} + +// Regression test: a recursive CTE whose anchor aliases a computed column +// (`upper(val) AS val`) and whose recursive term leaves the same expression +// un-aliased must still produce batches whose schema field names come from +// the anchor term — especially when the outer query uses ORDER BY + LIMIT +// (the TopK path passes batch schemas through verbatim, so any drift in +// RecursiveQueryExec's emitted batches is visible downstream). +#[tokio::test] +async fn test_recursive_cte_batch_schema_stable_with_order_by_limit() -> Result<()> { + let ctx = SessionContext::new(); + ctx.sql( + "CREATE TABLE records (\ + id VARCHAR NOT NULL, \ + parent_id VARCHAR, \ + ts TIMESTAMP NOT NULL, \ + val VARCHAR\ + )", + ) + .await? 
+ .collect() + .await?; + ctx.sql( + "INSERT INTO records VALUES \ + ('a00', NULL, TIMESTAMP '2025-01-01 00:00:00', 'v_span'), \ + ('a01', 'a00', TIMESTAMP '2025-01-01 00:00:01', 'v_log_1'), \ + ('a02', 'a01', TIMESTAMP '2025-01-01 00:00:02', 'v_log_2'), \ + ('a03', 'a02', TIMESTAMP '2025-01-01 00:00:03', 'v_log_3'), \ + ('a04', 'a03', TIMESTAMP '2025-01-01 00:00:04', 'v_log_4'), \ + ('a05', 'a04', TIMESTAMP '2025-01-01 00:00:05', 'v_log_5')", + ) + .await? + .collect() + .await?; + + let results = ctx + .sql( + "WITH RECURSIVE descendants AS (\ + SELECT id, parent_id, ts, upper(val) AS val \ + FROM records WHERE id = 'a00' \ + UNION ALL \ + SELECT r.id, r.parent_id, r.ts, upper(r.val) \ + FROM records r INNER JOIN descendants d ON r.parent_id = d.id \ + ) \ + SELECT id, parent_id, ts, val FROM descendants ORDER BY ts ASC LIMIT 10", + ) + .await? + .collect() + .await?; + + let expected_names = ["id", "parent_id", "ts", "val"]; + assert!(!results.is_empty(), "expected at least one batch"); + for (i, batch) in results.iter().enumerate() { + let schema = batch.schema(); + let actual_names: Vec<&str> = + schema.fields().iter().map(|f| f.name().as_str()).collect(); + assert_eq!( + actual_names, expected_names, + "batch {i} schema field names leaked from recursive branch" + ); + } + Ok(()) +} diff --git a/datafusion/core/tests/sql/sql_api.rs b/datafusion/core/tests/sql/sql_api.rs index b87afd27ddea7..290aa737d2742 100644 --- a/datafusion/core/tests/sql/sql_api.rs +++ b/datafusion/core/tests/sql/sql_api.rs @@ -16,6 +16,7 @@ // under the License. use datafusion::prelude::*; +use datafusion_common::assert_contains; use tempfile::TempDir; @@ -206,3 +207,19 @@ async fn ddl_can_not_be_planned_by_session_state() { "This feature is not implemented: Unsupported logical plan: DropTable" ); } + +#[tokio::test] +async fn invalid_wrapped_negation_fails_during_optimization() { + let ctx = SessionContext::new(); + let err = ctx + .sql("SELECT * FROM (SELECT 1) WHERE ((-'a') IS NULL)") + .await + .unwrap() + .into_optimized_plan() + .unwrap_err(); + + assert_contains!( + err.strip_backtrace(), + "Negation only supports numeric, interval and timestamp types" + ); +} diff --git a/datafusion/core/tests/sql/unparser.rs b/datafusion/core/tests/sql/unparser.rs new file mode 100644 index 0000000000000..d6ca872e198c3 --- /dev/null +++ b/datafusion/core/tests/sql/unparser.rs @@ -0,0 +1,466 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! SQL Unparser Roundtrip Integration Tests +//! +//! This module tests the [`Unparser`] by running queries through a complete roundtrip: +//! the original SQL is parsed into a logical plan, unparsed back to SQL, then that +//! generated SQL is parsed and executed. The results are compared to verify semantic +//! equivalence. +//! 
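+//! In sketch form, each case does roughly the following (`original_sql` stands
+//! in for one benchmark query; `ctx` is a context with the benchmark tables
+//! registered — see [`collect_results`] below for the full logic):
+//!
+//! ```ignore
+//! let df = ctx.sql(original_sql).await?;
+//! let unparsed = Unparser::new(&DefaultDialect {}).plan_to_sql(df.logical_plan())?;
+//! let roundtripped = ctx.sql(&unparsed.to_string()).await?.collect().await?;
+//! // `roundtripped` is then compared against the original query's results
+//! ```
+//!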
+//! ## Test Strategy
+//!
+//! Uses real-world benchmark queries (TPC-H and Clickbench) to validate that:
+//! 1. The unparser produces syntactically valid SQL
+//! 2. The unparsed SQL is semantically equivalent (produces identical results)
+//!
+//! ## Query Suites
+//!
+//! - **TPC-H**: Standard decision-support benchmark with 22 complex analytical queries
+//! - **Clickbench**: Web analytics benchmark with 43 queries against a denormalized schema
+//!
+//! [`Unparser`]: datafusion_sql::unparser::Unparser
+
+use std::fs::ReadDir;
+use std::future::Future;
+
+use arrow::array::RecordBatch;
+use datafusion::common::Result;
+use datafusion::prelude::{ParquetReadOptions, SessionContext};
+use datafusion_common::Column;
+use datafusion_expr::Expr;
+use datafusion_sql::unparser::Unparser;
+use datafusion_sql::unparser::dialect::DefaultDialect;
+use itertools::Itertools;
+use recursive::{set_minimum_stack_size, set_stack_allocation_size};
+
+/// Paths to benchmark query files (supports running from repo root or different working directories).
+const BENCHMARK_PATHS: &[&str] = &["../../benchmarks/", "./benchmarks/"];
+
+/// Reads all `.sql` files from a directory and converts them to test queries.
+///
+/// Skips files that:
+/// - Are not regular files
+/// - Don't have a `.sql` extension
+/// - Contain multiple SQL statements (indicated by `;\n`)
+///
+/// Multi-statement files are skipped because the unparser doesn't support
+/// DDL statements like `CREATE VIEW` that appear in multi-statement Clickbench queries.
+fn iterate_queries(dir: ReadDir) -> Vec<TestQuery> {
+    let mut queries = vec![];
+    for entry in dir.flatten() {
+        let Ok(file_type) = entry.file_type() else {
+            continue;
+        };
+        if !file_type.is_file() {
+            continue;
+        }
+        let path = entry.path();
+        let Some(ext) = path.extension() else {
+            continue;
+        };
+        if ext != "sql" {
+            continue;
+        }
+        let name = path.file_stem().unwrap().to_string_lossy().to_string();
+        if let Ok(mut contents) = std::fs::read_to_string(entry.path()) {
+            // If the query contains ;\n it has DDL statements like CREATE VIEW which the unparser doesn't support; skip it
+            contents = contents.trim().to_string();
+            if contents.contains(";\n") {
+                println!("Skipping query with multiple statements: {name}");
+                continue;
+            }
+            queries.push(TestQuery {
+                sql: contents,
+                name,
+            });
+        }
+    }
+    queries
+}
+
+/// A SQL query loaded from a benchmark file for roundtrip testing.
+///
+/// Each query is identified by its filename (without extension) and contains
+/// the full SQL text to be tested.
+struct TestQuery {
+    /// The SQL query text to test.
+    sql: String,
+    /// The query identifier (typically the filename without .sql extension).
+    name: String,
+}
+
+/// Collect SQL for Clickbench queries.
+fn clickbench_queries() -> Vec<TestQuery> {
+    let mut queries = vec![];
+    for path in BENCHMARK_PATHS {
+        let dir = format!("{path}queries/clickbench/queries/");
+        println!("Reading Clickbench queries from {dir}");
+        if let Ok(dir) = std::fs::read_dir(dir) {
+            let read = iterate_queries(dir);
+            println!("Found {} Clickbench queries", read.len());
+            queries.extend(read);
+        }
+    }
+    queries.sort_unstable_by_key(|q| {
+        q.name
+            .split('q')
+            .next_back()
+            .and_then(|num| num.parse::<u32>().ok())
+    });
+    queries
+}
+
+/// Collect SQL for TPC-H queries.
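+///
+/// Queries are sorted by file name so failures are reported in a stable order.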
+fn tpch_queries() -> Vec<TestQuery> {
+    let mut queries = vec![];
+    for path in BENCHMARK_PATHS {
+        let dir = format!("{path}queries/");
+        println!("Reading TPC-H queries from {dir}");
+        if let Ok(dir) = std::fs::read_dir(dir) {
+            let read = iterate_queries(dir);
+            queries.extend(read);
+        }
+    }
+    println!("Total TPC-H queries found: {}", queries.len());
+    queries.sort_unstable_by_key(|q| q.name.clone());
+    queries
+}
+
+/// Create a new SessionContext for testing that has all Clickbench tables registered.
+///
+/// Registers the raw Parquet as `hits_raw`, then creates a `hits` view that
+/// casts `EventDate` from UInt16 (day-offset) to DATE. This mirrors the
+/// approach used by the benchmark runner in `benchmarks/src/clickbench.rs`.
+async fn clickbench_test_context() -> Result<SessionContext> {
+    let ctx = SessionContext::new();
+    ctx.register_parquet(
+        "hits_raw",
+        "tests/data/clickbench_hits_10.parquet",
+        ParquetReadOptions::default(),
+    )
+    .await?;
+    ctx.sql(
+        r#"CREATE VIEW hits AS
+            SELECT * EXCEPT ("EventDate"),
+                   CAST(CAST("EventDate" AS INTEGER) AS DATE) AS "EventDate"
+            FROM hits_raw"#,
+    )
+    .await?;
+    // Sanity check we found the table by querying its schema
+    let df = ctx.sql("SELECT * FROM hits LIMIT 1").await?;
+    assert!(
+        !df.schema().fields().is_empty(),
+        "Clickbench 'hits' table not registered correctly"
+    );
+    Ok(ctx)
+}
+
+/// Create a new SessionContext for testing that has all TPC-H tables registered.
+async fn tpch_test_context() -> Result<SessionContext> {
+    let ctx = SessionContext::new();
+    let data_dir = "tests/data/";
+    // All tables have the pattern "tpch_<table>_small.parquet"
+    for table in [
+        "customer", "lineitem", "nation", "orders", "part", "partsupp", "region",
+        "supplier",
+    ] {
+        let path = format!("{data_dir}tpch_{table}_small.parquet");
+        ctx.register_parquet(table, &path, ParquetReadOptions::default())
+            .await?;
+        // Sanity check we found the table by querying its schema; it should not be empty.
+        // Otherwise if the path is wrong the tests will all fail in confusing ways
+        let df = ctx.sql(&format!("SELECT * FROM {table} LIMIT 1")).await?;
+        assert!(
+            !df.schema().fields().is_empty(),
+            "TPC-H '{table}' table not registered correctly"
+        );
+    }
+    Ok(ctx)
+}
+
+/// Sorts record batches by all columns for deterministic comparison.
+///
+/// When comparing query results, we need a canonical ordering so that
+/// semantically equivalent results compare as equal. This function sorts
+/// by all columns in the schema to achieve that.
+async fn sort_batches(
+    ctx: &SessionContext,
+    batches: Vec<RecordBatch>,
+) -> Result<Vec<RecordBatch>> {
+    let mut df = ctx.read_batches(batches)?;
+    let schema = df.schema().as_arrow().clone();
+    let sort_exprs = schema
+        .fields()
+        .iter()
+        // Use Column directly, col() causes the column names to be normalized to lowercase
+        .map(|f| {
+            Expr::Column(Column::new_unqualified(f.name().to_string())).sort(true, false)
+        })
+        .collect_vec();
+    if !sort_exprs.is_empty() {
+        df = df.sort(sort_exprs)?;
+    }
+    df.collect().await
+}
+
+/// The outcome of running a single roundtrip test.
+///
+/// A successful test produces [`TestCaseResult::Success`].
+/// All other variants capture different failure modes with enough context to diagnose the issue.
+enum TestCaseResult {
+    /// The unparsed SQL produced identical results to the original.
+    Success,
+
+    /// Both queries executed but produced different results.
+    ///
+    /// This indicates a semantic bug in the unparser where the generated SQL
+    /// has different meaning than the original.
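+    /// (For example, an alias or a predicate that binds differently after
+    /// unparsing would surface as this variant.)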
+ ResultsMismatch { original: String, unparsed: String }, + + /// The unparser failed to convert the logical plan to SQL. + /// + /// This may indicate an unsupported SQL feature or a bug in the unparser. + UnparseError { original: String, error: String }, + + /// The original SQL failed to execute. + /// + /// This indicates a problem with the test setup (missing tables, + /// invalid test data) rather than an unparser issue. + ExecutionError { original: String, error: String }, + + /// The unparsed SQL failed to execute, even though the original succeeded. + /// + /// This indicates the unparser generated syntactically invalid SQL or SQL + /// that references non-existent columns/tables. + UnparsedExecutionError { + original: String, + unparsed: String, + error: String, + }, +} + +impl TestCaseResult { + /// Returns true if the test case represents a failure + /// (anything other than [`TestCaseResult::Success`]). + fn is_failure(&self) -> bool { + !matches!(self, TestCaseResult::Success) + } + + /// Formats a detailed error message for the test case into a string. + fn format_error(&self, name: &str) -> String { + match self { + TestCaseResult::Success => String::new(), + TestCaseResult::ResultsMismatch { original, unparsed } => { + format!( + "Results mismatch for {name}.\nOriginal SQL:\n{original}\n\nUnparsed SQL:\n{unparsed}" + ) + } + TestCaseResult::UnparseError { original, error } => { + format!("Unparse error for {name}: {error}\nOriginal SQL:\n{original}") + } + TestCaseResult::ExecutionError { original, error } => { + format!("Execution error for {name}: {error}\nOriginal SQL:\n{original}") + } + TestCaseResult::UnparsedExecutionError { + original, + unparsed, + error, + } => { + format!( + "Unparsed execution error for {name}: {error}\nOriginal SQL:\n{original}\n\nUnparsed SQL:\n{unparsed}" + ) + } + } + } +} + +/// Executes a roundtrip test for a single SQL query. +/// +/// This is the core test logic that: +/// 1. Parses the original SQL and creates a logical plan +/// 2. Unparses the logical plan back to SQL +/// 3. Executes both the original and unparsed queries +/// 4. Compares the results (sorting if the query has no ORDER BY) +/// +/// This always uses [`DefaultDialect`] for unparsing. +/// +/// # Arguments +/// +/// * `ctx` - Session context with tables registered +/// * `original` - The original SQL query to test +/// +/// # Returns +/// +/// A [`TestCaseResult`] indicating success or the specific failure mode. 
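+///
+/// Sketch of intended use (assuming a context with the relevant tables
+/// registered):
+///
+/// ```ignore
+/// let result = collect_results(&ctx, "SELECT 1").await;
+/// assert!(!result.is_failure());
+/// ```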
+async fn collect_results(ctx: &SessionContext, original: &str) -> TestCaseResult { + let unparser = Unparser::new(&DefaultDialect {}); + + // Parse and create logical plan from original SQL + let df = match ctx.sql(original).await { + Ok(df) => df, + Err(e) => { + return TestCaseResult::ExecutionError { + original: original.to_string(), + error: e.to_string(), + }; + } + }; + + // Unparse the logical plan back to SQL + let unparsed = match unparser.plan_to_sql(df.logical_plan()) { + Ok(sql) => format!("{sql:#}"), + Err(e) => { + return TestCaseResult::UnparseError { + original: original.to_string(), + error: e.to_string(), + }; + } + }; + + // Collect results from original query + let mut expected = match df.collect().await { + Ok(batches) => batches, + Err(e) => { + return TestCaseResult::ExecutionError { + original: original.to_string(), + error: e.to_string(), + }; + } + }; + + // Parse and execute the unparsed SQL + let actual_df = match ctx.sql(&unparsed).await { + Ok(df) => df, + Err(e) => { + return TestCaseResult::UnparsedExecutionError { + original: original.to_string(), + unparsed, + error: e.to_string(), + }; + } + }; + + // Collect results from unparsed query + let mut actual = match actual_df.collect().await { + Ok(batches) => batches, + Err(e) => { + return TestCaseResult::UnparsedExecutionError { + original: original.to_string(), + unparsed, + error: e.to_string(), + }; + } + }; + + // Always sort for deterministic comparison — even "sorted" results can have + // tied rows in different order between original and unparsed SQL. + { + expected = match sort_batches(ctx, expected).await { + Ok(batches) => batches, + Err(e) => { + return TestCaseResult::ExecutionError { + original: original.to_string(), + error: format!("Failed to sort expected results: {e}"), + }; + } + }; + actual = match sort_batches(ctx, actual).await { + Ok(batches) => batches, + Err(e) => { + return TestCaseResult::UnparsedExecutionError { + original: original.to_string(), + unparsed, + error: format!("Failed to sort actual results: {e}"), + }; + } + }; + } + + if expected != actual { + TestCaseResult::ResultsMismatch { + original: original.to_string(), + unparsed, + } + } else { + TestCaseResult::Success + } +} + +/// Runs roundtrip tests for a collection of queries and reports results. +/// +/// Iterates through all queries, running each through [`collect_results`]. +/// Prints colored status (green checkmark for success, red X for failure) +/// and panics at the end if any tests failed, with detailed error messages. +/// +/// # Type Parameters +/// +/// * `F` - Factory function that creates fresh session contexts +/// * `Fut` - Future type returned by the context factory +/// +/// # Panics +/// +/// Panics if any query fails the roundtrip test, displaying all failures. 
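+///
+/// (See `test_tpch_unparser_roundtrip` at the bottom of this file for a
+/// concrete invocation.)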
+async fn run_roundtrip_tests<F, Fut>(
+    suite_name: &str,
+    queries: Vec<TestQuery>,
+    create_context: F,
+) where
+    F: Fn() -> Fut,
+    Fut: Future<Output = Result<SessionContext>>,
+{
+    let mut errors: Vec<String> = vec![];
+    for sql in queries {
+        let ctx = match create_context().await {
+            Ok(ctx) => ctx,
+            Err(e) => {
+                println!("\x1b[31m✗\x1b[0m {} query: {}", suite_name, sql.name);
+                errors.push(format!("Failed to create context for {}: {}", sql.name, e));
+                continue;
+            }
+        };
+        let result = collect_results(&ctx, &sql.sql).await;
+        if result.is_failure() {
+            println!("\x1b[31m✗\x1b[0m {} query: {}", suite_name, sql.name);
+            errors.push(result.format_error(&sql.name));
+        } else {
+            println!("\x1b[32m✓\x1b[0m {} query: {}", suite_name, sql.name);
+        }
+    }
+    if !errors.is_empty() {
+        panic!(
+            "{} {} test(s) failed:\n\n{}",
+            errors.len(),
+            suite_name,
+            errors.join("\n\n---\n\n")
+        );
+    }
+}
+
+#[tokio::test]
+async fn test_clickbench_unparser_roundtrip() {
+    run_roundtrip_tests("Clickbench", clickbench_queries(), clickbench_test_context)
+        .await;
+}
+
+#[tokio::test]
+async fn test_tpch_unparser_roundtrip() {
+    // Grow stacker segments earlier to avoid deep unparser recursion overflow in q20.
+    set_minimum_stack_size(512 * 1024);
+    set_stack_allocation_size(8 * 1024 * 1024);
+    run_roundtrip_tests("TPC-H", tpch_queries(), tpch_test_context).await;
+}
diff --git a/datafusion/core/tests/tpc-ds/30.sql b/datafusion/core/tests/tpc-ds/30.sql
index 78f34b807e5b5..80624f49006a9 100644
--- a/datafusion/core/tests/tpc-ds/30.sql
+++ b/datafusion/core/tests/tpc-ds/30.sql
@@ -14,7 +14,7 @@ with customer_total_return as
  ,ca_state)
  select c_customer_id,c_salutation,c_first_name,c_last_name,c_preferred_cust_flag
        ,c_birth_day,c_birth_month,c_birth_year,c_birth_country,c_login,c_email_address
-       ,c_last_review_date_sk,ctr_total_return
+       ,c_last_review_date,ctr_total_return
 from customer_total_return ctr1
     ,customer_address
     ,customer
@@ -26,7 +26,7 @@ with customer_total_return as
  and ctr1.ctr_customer_sk = c_customer_sk
 order by c_customer_id,c_salutation,c_first_name,c_last_name,c_preferred_cust_flag
         ,c_birth_day,c_birth_month,c_birth_year,c_birth_country,c_login,c_email_address
-        ,c_last_review_date_sk,ctr_total_return
+        ,c_last_review_date,ctr_total_return
 limit 100;
diff --git a/datafusion/core/tests/tpcds_planning.rs b/datafusion/core/tests/tpcds_planning.rs
index 252d76d0f9d92..3ad74962bc2c0 100644
--- a/datafusion/core/tests/tpcds_planning.rs
+++ b/datafusion/core/tests/tpcds_planning.rs
@@ -1052,9 +1052,12 @@ async fn regression_test(query_no: u8, create_physical: bool) -> Result<()> {
     for sql in &sql {
         let df = ctx.sql(sql).await?;
         let (state, plan) = df.into_parts();
-        let plan = state.optimize(&plan)?;
         if create_physical {
             let _ = state.create_physical_plan(&plan).await?;
+        } else {
+            // Run the logical optimizer even if we are not creating the physical plan
+            // to ensure it will properly succeed
+            let _ = state.optimize(&plan)?;
         }
     }
 
diff --git a/datafusion/core/tests/tracing/asserting_tracer.rs b/datafusion/core/tests/tracing/asserting_tracer.rs
index 292e066e5f121..700f9f3308466 100644
--- a/datafusion/core/tests/tracing/asserting_tracer.rs
+++ b/datafusion/core/tests/tracing/asserting_tracer.rs
@@ -21,7 +21,7 @@ use std::ops::Deref;
 use std::sync::{Arc, LazyLock};
 
 use datafusion_common::{HashMap, HashSet};
-use datafusion_common_runtime::{set_join_set_tracer, JoinSetTracer};
+use datafusion_common_runtime::{JoinSetTracer, set_join_set_tracer};
 
 use futures::future::BoxFuture;
 use tokio::sync::{Mutex, MutexGuard};
diff --git a/datafusion/core/tests/tracing/traceable_object_store.rs b/datafusion/core/tests/tracing/traceable_object_store.rs
index 60ef1cc5d6b6a..71a61dbf8772a 100644
--- a/datafusion/core/tests/tracing/traceable_object_store.rs
+++ b/datafusion/core/tests/tracing/traceable_object_store.rs
@@ -18,10 +18,11 @@
 //! Object store implementation used for testing
 
 use crate::tracing::asserting_tracer::assert_traceability;
+use futures::StreamExt;
 use futures::stream::BoxStream;
 use object_store::{
-    path::Path, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta,
-    ObjectStore, PutMultipartOptions, PutOptions, PutPayload, PutResult,
+    CopyOptions, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta,
+    ObjectStore, PutMultipartOptions, PutOptions, PutPayload, PutResult, path::Path,
 };
 use std::fmt::{Debug, Display, Formatter};
 use std::sync::Arc;
@@ -83,14 +84,17 @@ impl ObjectStore for TraceableObjectStore {
         self.inner.get_opts(location, options).await
     }
 
-    async fn head(&self, location: &Path) -> object_store::Result<ObjectMeta> {
-        assert_traceability().await;
-        self.inner.head(location).await
-    }
-
-    async fn delete(&self, location: &Path) -> object_store::Result<()> {
-        assert_traceability().await;
-        self.inner.delete(location).await
+    fn delete_stream(
+        &self,
+        locations: BoxStream<'static, object_store::Result<Path>>,
+    ) -> BoxStream<'static, object_store::Result<Path>> {
+        self.inner
+            .delete_stream(locations)
+            .then(|res| async {
+                futures::executor::block_on(assert_traceability());
+                res
+            })
+            .boxed()
     }
 
     fn list(
@@ -109,17 +113,13 @@ impl ObjectStore for TraceableObjectStore {
         self.inner.list_with_delimiter(prefix).await
     }
 
-    async fn copy(&self, from: &Path, to: &Path) -> object_store::Result<()> {
-        assert_traceability().await;
-        self.inner.copy(from, to).await
-    }
-
-    async fn copy_if_not_exists(
+    async fn copy_opts(
         &self,
         from: &Path,
         to: &Path,
+        options: CopyOptions,
     ) -> object_store::Result<()> {
         assert_traceability().await;
-        self.inner.copy_if_not_exists(from, to).await
+        self.inner.copy_opts(from, to, options).await
     }
 }
diff --git a/datafusion/core/tests/user_defined/expr_planner.rs b/datafusion/core/tests/user_defined/expr_planner.rs
index 07d289cab06c2..c5e5af731359f 100644
--- a/datafusion/core/tests/user_defined/expr_planner.rs
+++ b/datafusion/core/tests/user_defined/expr_planner.rs
@@ -26,9 +26,9 @@ use datafusion::logical_expr::Operator;
 use datafusion::prelude::*;
 use datafusion::sql::sqlparser::ast::BinaryOperator;
 use datafusion_common::ScalarValue;
+use datafusion_expr::BinaryExpr;
 use datafusion_expr::expr::Alias;
 use datafusion_expr::planner::{ExprPlanner, PlannerResult, RawBinaryExpr};
-use datafusion_expr::BinaryExpr;
 
 #[derive(Debug)]
 struct MyCustomPlanner;
@@ -77,25 +77,25 @@ async fn plan_and_collect(sql: &str) -> Result<Vec<RecordBatch>> {
 #[tokio::test]
 async fn test_custom_operators_arrow() {
     let actual = plan_and_collect("select 'foo'->'bar';").await.unwrap();
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
+    insta::assert_snapshot!(batches_to_string(&actual), @r#"
     +----------------------------+
     | Utf8("foo") || Utf8("bar") |
     +----------------------------+
     | foobar                     |
    +----------------------------+
-    "###);
+    "#);
 }
 
 #[tokio::test]
 async fn test_custom_operators_long_arrow() {
     let actual = plan_and_collect("select 1->>2;").await.unwrap();
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
     +---------------------+
     | Int64(1) + Int64(2) |
     +---------------------+
     | 3                   |
     +---------------------+
-    "###);
+    ");
 }
 
 #[tokio::test]
@@ -103,13 +103,13 @@ async fn test_question_select() {
     let actual = plan_and_collect("select a ? 2 from (select 1 as a);")
         .await
         .unwrap();
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
    +--------------+
    | a ? Int64(2) |
    +--------------+
    | true         |
    +--------------+
-    "###);
+    ");
 }
 
 #[tokio::test]
@@ -117,11 +117,11 @@ async fn test_question_filter() {
     let actual = plan_and_collect("select a from (select 1 as a) where a ? 2;")
         .await
         .unwrap();
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
    +---+
    | a |
    +---+
    | 1 |
    +---+
-    "###);
+    ");
 }
diff --git a/datafusion/core/tests/user_defined/insert_operation.rs b/datafusion/core/tests/user_defined/insert_operation.rs
index e0a3e98604ae4..326c767d97610 100644
--- a/datafusion/core/tests/user_defined/insert_operation.rs
+++ b/datafusion/core/tests/user_defined/insert_operation.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::{any::Any, str::FromStr, sync::Arc};
+use std::{str::FromStr, sync::Arc};
 
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use async_trait::async_trait;
@@ -25,12 +25,13 @@ use datafusion::{
 };
 use datafusion_catalog::{Session, TableProvider};
 use datafusion_common::config::Dialect;
-use datafusion_expr::{dml::InsertOp, Expr, TableType};
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_expr::{Expr, TableType, dml::InsertOp};
 use datafusion_physical_expr::{EquivalenceProperties, Partitioning};
 use datafusion_physical_plan::execution_plan::SchedulingType;
 use datafusion_physical_plan::{
-    execution_plan::{Boundedness, EmissionType},
     DisplayAs, ExecutionPlan, PlanProperties,
+    execution_plan::{Boundedness, EmissionType},
 };
 
 #[tokio::test]
@@ -57,7 +58,7 @@ async fn insert_operation_is_passed_correctly_to_table_provider() {
 async fn assert_insert_op(ctx: &SessionContext, sql: &str, insert_op: InsertOp) {
     let df = ctx.sql(sql).await.unwrap();
     let plan = df.create_physical_plan().await.unwrap();
-    let exec = plan.as_any().downcast_ref::<TestInsertExec>().unwrap();
+    let exec = plan.downcast_ref::<TestInsertExec>().unwrap();
     assert_eq!(exec.op, insert_op);
 }
 
@@ -87,10 +88,6 @@ impl TestInsertTableProvider {
 
 #[async_trait]
 impl TableProvider for TestInsertTableProvider {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
     fn schema(&self) -> SchemaRef {
         self.schema.clone()
     }
@@ -122,20 +119,22 @@ impl TableProvider for TestInsertTableProvider {
 #[derive(Debug)]
 struct TestInsertExec {
     op: InsertOp,
-    plan_properties: PlanProperties,
+    plan_properties: Arc<PlanProperties>,
 }
 
 impl TestInsertExec {
     fn new(op: InsertOp) -> Self {
         Self {
             op,
-            plan_properties: PlanProperties::new(
-                EquivalenceProperties::new(make_count_schema()),
-                Partitioning::UnknownPartitioning(1),
-                EmissionType::Incremental,
-                Boundedness::Bounded,
-            )
-            .with_scheduling_type(SchedulingType::Cooperative),
+            plan_properties: Arc::new(
+                PlanProperties::new(
+                    EquivalenceProperties::new(make_count_schema()),
+                    Partitioning::UnknownPartitioning(1),
+                    EmissionType::Incremental,
+                    Boundedness::Bounded,
+                )
+                .with_scheduling_type(SchedulingType::Cooperative),
+            ),
         }
     }
 }
@@ -155,11 +154,7 @@ impl ExecutionPlan for TestInsertExec {
         "TestInsertExec"
     }
 
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.plan_properties
     }
 
@@ -182,6 +177,22 @@ ) -> Result<SendableRecordBatchStream> {
unimplemented!("TestInsertExec is a stub for testing.")
     }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(
+            &dyn datafusion_physical_plan::PhysicalExpr,
+        ) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in the output ordering from equivalence properties
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = self.plan_properties.output_ordering() {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
+    }
 }
 
 fn make_count_schema() -> SchemaRef {
diff --git a/datafusion/core/tests/user_defined/mod.rs b/datafusion/core/tests/user_defined/mod.rs
index 5d84cdb692830..bc9949f5d681c 100644
--- a/datafusion/core/tests/user_defined/mod.rs
+++ b/datafusion/core/tests/user_defined/mod.rs
@@ -15,6 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
+/// Tests for user defined Async Scalar functions
+mod user_defined_async_scalar_functions;
+
 /// Tests for user defined Scalar functions
 mod user_defined_scalar_functions;
 
@@ -33,5 +36,8 @@ mod user_defined_table_functions;
 /// Tests for Expression Planner
 mod expr_planner;
 
+/// Tests for Relation Planner extensions
+mod relation_planner;
+
 /// Tests for insert operations
 mod insert_operation;
diff --git a/datafusion/core/tests/user_defined/relation_planner.rs b/datafusion/core/tests/user_defined/relation_planner.rs
new file mode 100644
index 0000000000000..54af53ad858d4
--- /dev/null
+++ b/datafusion/core/tests/user_defined/relation_planner.rs
@@ -0,0 +1,531 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Tests for the RelationPlanner extension point
+
+use std::sync::Arc;
+
+use arrow::array::{Int64Array, RecordBatch, StringArray};
+use arrow::datatypes::{DataType, Field, Schema};
+use datafusion::catalog::memory::MemTable;
+use datafusion::common::test_util::batches_to_string;
+use datafusion::prelude::*;
+use datafusion_common::{Result, ScalarValue};
+use datafusion_expr::Expr;
+use datafusion_expr::logical_plan::builder::LogicalPlanBuilder;
+use datafusion_expr::planner::{
+    PlannedRelation, RelationPlanner, RelationPlannerContext, RelationPlanning,
+};
+use datafusion_sql::sqlparser::ast::TableFactor;
+use insta::assert_snapshot;
+
+// ============================================================================
+// Test Planners - Example Implementations
+// ============================================================================
+
+// The planners in this section are deliberately minimal, static examples used
+// only for tests. In real applications a `RelationPlanner` would typically
+// construct richer logical plans tailored to external systems or custom
+// semantics rather than hard-coded in-memory tables.
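+//
+// The contract each example follows: return `RelationPlanning::Planned` to
+// claim a relation, or hand it back via `RelationPlanning::Original` so that
+// another planner (or the built-in planning path) can handle it.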
+// +// For more realistic examples, see `datafusion-examples/examples/relation_planner/`: +// - `table_sample.rs`: Full TABLESAMPLE implementation (parsing → execution) +// - `pivot_unpivot.rs`: PIVOT/UNPIVOT via SQL rewriting +// - `match_recognize.rs`: MATCH_RECOGNIZE logical planning + +/// Helper to build simple static values-backed virtual tables used by the +/// example planners below. +fn plan_static_values_table( + relation: TableFactor, + table_name: &str, + column_name: &str, + values: Vec<ScalarValue>, +) -> Result<RelationPlanning> { + match relation { + TableFactor::Table { name, alias, .. } + if name.to_string().eq_ignore_ascii_case(table_name) => + { + let rows = values + .into_iter() + .map(|v| vec![Expr::Literal(v, None)]) + .collect::<Vec<_>>(); + + let plan = LogicalPlanBuilder::values(rows)? + .project(vec![col("column1").alias(column_name)])? + .build()?; + + Ok(RelationPlanning::Planned(Box::new(PlannedRelation::new( + plan, alias, + )))) + } + other => Ok(RelationPlanning::Original(Box::new(other))), + } +} + +/// Example planner that provides a virtual `numbers` table with values +/// 1, 2, 3. +#[derive(Debug)] +struct NumbersPlanner; + +impl RelationPlanner for NumbersPlanner { + fn plan_relation( + &self, + relation: TableFactor, + _context: &mut dyn RelationPlannerContext, + ) -> Result<RelationPlanning> { + plan_static_values_table( + relation, + "numbers", + "number", + vec![ + ScalarValue::Int64(Some(1)), + ScalarValue::Int64(Some(2)), + ScalarValue::Int64(Some(3)), + ], + ) + } +} + +/// Example planner that provides a virtual `colors` table with three string +/// values: `red`, `green`, `blue`. +#[derive(Debug)] +struct ColorsPlanner; + +impl RelationPlanner for ColorsPlanner { + fn plan_relation( + &self, + relation: TableFactor, + _context: &mut dyn RelationPlannerContext, + ) -> Result<RelationPlanning> { + plan_static_values_table( + relation, + "colors", + "color", + vec![ + ScalarValue::Utf8(Some("red".into())), + ScalarValue::Utf8(Some("green".into())), + ScalarValue::Utf8(Some("blue".into())), + ], + ) + } +} + +/// Alternative implementation of `numbers` (returns 100, 200) used to +/// demonstrate planner precedence (last registered planner wins). +#[derive(Debug)] +struct AlternativeNumbersPlanner; + +impl RelationPlanner for AlternativeNumbersPlanner { + fn plan_relation( + &self, + relation: TableFactor, + _context: &mut dyn RelationPlannerContext, + ) -> Result<RelationPlanning> { + plan_static_values_table( + relation, + "numbers", + "number", + vec![ScalarValue::Int64(Some(100)), ScalarValue::Int64(Some(200))], + ) + } +} + +/// Example planner that intercepts nested joins and samples both sides (limit 2) +/// before joining, demonstrating recursive planning with `context.plan()`. +#[derive(Debug)] +struct SamplingJoinPlanner; + +impl RelationPlanner for SamplingJoinPlanner { + fn plan_relation( + &self, + relation: TableFactor, + context: &mut dyn RelationPlannerContext, + ) -> Result<RelationPlanning> { + match relation { + TableFactor::NestedJoin { + table_with_joins, + alias, + ..
+ } if table_with_joins.joins.len() == 1 => { + // Use context.plan() to recursively plan both sides + // This ensures other planners (like NumbersPlanner) can handle them + let left = context.plan(table_with_joins.relation.clone())?; + let right = context.plan(table_with_joins.joins[0].relation.clone())?; + + // Sample each table to 2 rows + let left_sampled = + LogicalPlanBuilder::from(left).limit(0, Some(2))?.build()?; + + let right_sampled = + LogicalPlanBuilder::from(right).limit(0, Some(2))?.build()?; + + // Cross join: 2 rows × 2 rows = 4 rows (instead of 3×3=9 without sampling) + let plan = LogicalPlanBuilder::from(left_sampled) + .cross_join(right_sampled)? + .build()?; + + Ok(RelationPlanning::Planned(Box::new(PlannedRelation::new( + plan, alias, + )))) + } + other => Ok(RelationPlanning::Original(Box::new(other))), + } + } +} + +/// Example planner that never handles any relation and always delegates by +/// returning `RelationPlanning::Original`. +#[derive(Debug)] +struct PassThroughPlanner; + +impl RelationPlanner for PassThroughPlanner { + fn plan_relation( + &self, + relation: TableFactor, + _context: &mut dyn RelationPlannerContext, + ) -> Result<RelationPlanning> { + // Never handles anything - always delegates + Ok(RelationPlanning::Original(Box::new(relation))) + } +} + +/// Example planner that shows how planners can block specific constructs and +/// surface custom error messages by rejecting `UNNEST` relations (here framed +/// as a mock premium feature check). +#[derive(Debug)] +struct PremiumFeaturePlanner; + +impl RelationPlanner for PremiumFeaturePlanner { + fn plan_relation( + &self, + relation: TableFactor, + _context: &mut dyn RelationPlannerContext, + ) -> Result<RelationPlanning> { + match relation { + TableFactor::UNNEST { .. } => Err(datafusion_common::DataFusionError::Plan( + "UNNEST is a premium feature! Please upgrade to DataFusion Pro™ \ + to unlock advanced array operations." + .to_string(), + )), + other => Ok(RelationPlanning::Original(Box::new(other))), + } + } +} + +// ============================================================================ +// Test Helpers - SQL Execution +// ============================================================================ + +/// Execute SQL and return results with better error messages. +async fn execute_sql(ctx: &SessionContext, sql: &str) -> Result<Vec<RecordBatch>> { + let df = ctx.sql(sql).await?; + df.collect().await +} + +/// Execute SQL and convert to string format for snapshot comparison. +async fn execute_sql_to_string(ctx: &SessionContext, sql: &str) -> String { + let batches = execute_sql(ctx, sql) + .await + .expect("SQL execution should succeed"); + batches_to_string(&batches) +} + +// ============================================================================ +// Test Helpers - Context Builders +// ============================================================================ + +/// Create a SessionContext with a catalog table containing Int64 and Utf8 columns. +/// +/// Creates a table with the specified name and sample data for fallback/integration tests.
+fn create_context_with_catalog_table( + table_name: &str, + id_values: Vec<i64>, + name_values: Vec<&str>, +) -> SessionContext { + let ctx = SessionContext::new(); + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("name", DataType::Utf8, false), + ])); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int64Array::from(id_values)), + Arc::new(StringArray::from(name_values)), + ], + ) + .unwrap(); + + let table = MemTable::try_new(schema, vec![vec![batch]]).unwrap(); + ctx.register_table(table_name, Arc::new(table)).unwrap(); + + ctx +} + +/// Create a SessionContext with a simple single-column Int64 table. +/// +/// Useful for basic tests that need a real catalog table. +fn create_context_with_simple_table( + table_name: &str, + values: Vec<i64>, +) -> SessionContext { + let ctx = SessionContext::new(); + + let schema = Arc::new(Schema::new(vec![Field::new( + "value", + DataType::Int64, + true, + )])); + + let batch = + RecordBatch::try_new(schema.clone(), vec![Arc::new(Int64Array::from(values))]) + .unwrap(); + + let table = MemTable::try_new(schema, vec![vec![batch]]).unwrap(); + ctx.register_table(table_name, Arc::new(table)).unwrap(); + + ctx +} + +// ============================================================================ +// TESTS: Ordered from Basic to Complex +// ============================================================================ + +/// Comprehensive test suite for RelationPlanner extension point. +/// Tests are ordered from simplest smoke test to most complex scenarios. +#[cfg(test)] +mod tests { + use super::*; + + /// Small extension trait to make test setup read fluently. + trait TestSessionExt { + fn with_planner<P: RelationPlanner + 'static>(self, planner: P) -> Self; + } + + impl TestSessionExt for SessionContext { + fn with_planner<P: RelationPlanner + 'static>(self, planner: P) -> Self { + self.register_relation_planner(Arc::new(planner)).unwrap(); + self + } + } + + /// Session context with only the `NumbersPlanner` registered. + fn ctx_with_numbers() -> SessionContext { + SessionContext::new().with_planner(NumbersPlanner) + } + + /// Session context with virtual tables (`numbers`, `colors`) and the + /// `SamplingJoinPlanner` registered for nested joins. + fn ctx_with_virtual_tables_and_sampling() -> SessionContext { + SessionContext::new() + .with_planner(NumbersPlanner) + .with_planner(ColorsPlanner) + .with_planner(SamplingJoinPlanner) + } + + // Basic smoke test: virtual table can be queried like a regular table. + #[tokio::test] + async fn virtual_table_basic_select() { + let ctx = ctx_with_numbers(); + + let result = execute_sql_to_string(&ctx, "SELECT * FROM numbers").await; + + assert_snapshot!(result, @r" + +--------+ + | number | + +--------+ + | 1 | + | 2 | + | 3 | + +--------+ + "); + } + + // Virtual table supports standard SQL operations (projection, filter, aggregation).
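The tests that follow all rely on the same dispatch semantics, so it is worth spelling them out once. The sketch below is a conceptual re-implementation of the behavior the assertions verify (LIFO precedence, pass-through delegation, catalog fallback), not DataFusion's actual internals:

    // Conceptual dispatch, assuming `planners` is ordered newest-first
    fn dispatch(
        planners: &[Arc<dyn RelationPlanner>],
        mut relation: TableFactor,
        ctx: &mut dyn RelationPlannerContext,
    ) -> Result<Option<Box<PlannedRelation>>> {
        for planner in planners {
            match planner.plan_relation(relation, ctx)? {
                // the first planner to claim the relation wins
                RelationPlanning::Planned(planned) => return Ok(Some(planned)),
                // a declined relation is handed to the next planner unchanged
                RelationPlanning::Original(original) => relation = *original,
            }
        }
        // no planner claimed it: the caller falls back to catalog resolution
        Ok(None)
    }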
+ #[tokio::test] + async fn virtual_table_filters_and_aggregation() { + let ctx = ctx_with_numbers(); + + let filtered = execute_sql_to_string( + &ctx, + "SELECT number * 10 AS scaled FROM numbers WHERE number > 1", + ) + .await; + + assert_snapshot!(filtered, @r" + +--------+ + | scaled | + +--------+ + | 20 | + | 30 | + +--------+ + "); + + let aggregated = execute_sql_to_string( + &ctx, + "SELECT COUNT(*) as count, SUM(number) as total, AVG(number) as average \ + FROM numbers", + ) + .await; + + assert_snapshot!(aggregated, @r" + +-------+-------+---------+ + | count | total | average | + +-------+-------+---------+ + | 3 | 6 | 2.0 | + +-------+-------+---------+ + "); + } + + // Multiple planners can coexist and each handles its own virtual table. + #[tokio::test] + async fn multiple_planners_virtual_tables() { + let ctx = SessionContext::new() + .with_planner(NumbersPlanner) + .with_planner(ColorsPlanner); + + let result1 = execute_sql_to_string(&ctx, "SELECT * FROM numbers").await; + assert_snapshot!(result1, @r" + +--------+ + | number | + +--------+ + | 1 | + | 2 | + | 3 | + +--------+ + "); + + let result2 = execute_sql_to_string(&ctx, "SELECT * FROM colors").await; + assert_snapshot!(result2, @r" + +-------+ + | color | + +-------+ + | red | + | green | + | blue | + +-------+ + "); + } + + // Last registered planner for the same table name takes precedence (LIFO). + #[tokio::test] + async fn lifo_precedence_last_planner_wins() { + let ctx = SessionContext::new() + .with_planner(AlternativeNumbersPlanner) + .with_planner(NumbersPlanner); + + let result = execute_sql_to_string(&ctx, "SELECT * FROM numbers").await; + + // NumbersPlanner registered last, should win (returns 1,2,3 not 100,200) + assert_snapshot!(result, @r" + +--------+ + | number | + +--------+ + | 1 | + | 2 | + | 3 | + +--------+ + "); + } + + // Pass-through planner delegates to the catalog without changing behavior. + #[tokio::test] + async fn delegation_pass_through_to_catalog() { + let ctx = create_context_with_simple_table("real_table", vec![42]) + .with_planner(PassThroughPlanner); + + let result = execute_sql_to_string(&ctx, "SELECT * FROM real_table").await; + + assert_snapshot!(result, @r" + +-------+ + | value | + +-------+ + | 42 | + +-------+ + "); + } + + // Catalog is used when no planner claims the relation. + #[tokio::test] + async fn catalog_fallback_when_no_planner() { + let ctx = + create_context_with_catalog_table("users", vec![1, 2], vec!["Alice", "Bob"]) + .with_planner(NumbersPlanner); + + let result = execute_sql_to_string(&ctx, "SELECT * FROM users ORDER BY id").await; + + assert_snapshot!(result, @r" + +----+-------+ + | id | name | + +----+-------+ + | 1 | Alice | + | 2 | Bob | + +----+-------+ + "); + } + + // Planners can block specific constructs and surface custom error messages.
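Blocking a construct, as the next test does, requires nothing more than returning an `Err` from `plan_relation`; the error then surfaces from `ctx.sql(...)` at planning time. A more compact form of `PremiumFeaturePlanner`'s match arm could use the `plan_err!` shorthand from `datafusion_common` (a sketch, not part of the test file):

    use datafusion_common::plan_err;

    match relation {
        // blocked construct: surface a custom planning error
        TableFactor::UNNEST { .. } => plan_err!("UNNEST is a premium feature!"),
        // everything else is delegated untouched
        other => Ok(RelationPlanning::Original(Box::new(other))),
    }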
+ #[tokio::test] + async fn error_handling_premium_feature_blocking() { + // Verify UNNEST works without planner + let ctx_without_planner = SessionContext::new(); + let result = + execute_sql(&ctx_without_planner, "SELECT * FROM UNNEST(ARRAY[1, 2, 3])") + .await + .expect("UNNEST should work by default"); + assert_eq!(result.len(), 1); + + // Same query with blocking planner registered + let ctx = SessionContext::new().with_planner(PremiumFeaturePlanner); + + // Verify UNNEST is now rejected + let error = execute_sql(&ctx, "SELECT * FROM UNNEST(ARRAY[1, 2, 3])") + .await + .expect_err("UNNEST should be rejected"); + + let error_msg = error.to_string(); + assert!( + error_msg.contains("premium feature") && error_msg.contains("DataFusion Pro"), + "Expected custom rejection message, got: {error_msg}" + ); + } + + // SamplingJoinPlanner recursively calls `context.plan()` on both sides of a + // nested join before sampling, exercising recursive relation planning. + #[tokio::test] + async fn recursive_planning_sampling_join() { + let ctx = ctx_with_virtual_tables_and_sampling(); + + let result = + execute_sql_to_string(&ctx, "SELECT * FROM (numbers JOIN colors ON true)") + .await; + + // SamplingJoinPlanner limits each side to 2 rows: 2×2=4 (not 3×3=9) + assert_snapshot!(result, @r" + +--------+-------+ + | number | color | + +--------+-------+ + | 1 | red | + | 1 | green | + | 2 | red | + | 2 | green | + +--------+-------+ + "); + } +} diff --git a/datafusion/core/tests/user_defined/user_defined_aggregates.rs b/datafusion/core/tests/user_defined/user_defined_aggregates.rs index 62e8ab18b9be0..7d22c5df70dfc 100644 --- a/datafusion/core/tests/user_defined/user_defined_aggregates.rs +++ b/datafusion/core/tests/user_defined/user_defined_aggregates.rs @@ -18,18 +18,17 @@ //! This module contains end to end demonstrations of creating //! 
user defined aggregate functions -use std::any::Any; use std::collections::HashMap; use std::hash::{Hash, Hasher}; use std::mem::{size_of, size_of_val}; use std::sync::{ - atomic::{AtomicBool, Ordering}, Arc, + atomic::{AtomicBool, Ordering}, }; use arrow::array::{ - record_batch, types::UInt64Type, Array, AsArray, Int32Array, PrimitiveArray, - StringArray, StructArray, UInt64Array, + Array, AsArray, Int32Array, PrimitiveArray, StringArray, StructArray, UInt64Array, + record_batch, types::UInt64Type, }; use arrow::datatypes::{Fields, Schema}; use arrow_schema::FieldRef; @@ -56,8 +55,8 @@ use datafusion_common::{cast::as_primitive_array, exec_err}; use datafusion_expr::expr::WindowFunction; use datafusion_expr::{ - col, create_udaf, function::AccumulatorArgs, AggregateUDFImpl, Expr, - GroupsAccumulator, LogicalPlanBuilder, SimpleAggregateUDF, WindowFunctionDefinition, + AggregateUDFImpl, Expr, GroupsAccumulator, LogicalPlanBuilder, SimpleAggregateUDF, + WindowFunctionDefinition, col, create_udaf, function::AccumulatorArgs, }; use datafusion_functions_aggregate::average::AvgAccumulator; @@ -69,7 +68,7 @@ async fn test_setup() { let actual = execute(&ctx, sql).await.unwrap(); - insta::assert_snapshot!(batches_to_string(&actual), @r###" + insta::assert_snapshot!(batches_to_string(&actual), @r" +-------+----------------------------+ | value | time | +-------+----------------------------+ @@ -79,7 +78,7 @@ async fn test_setup() { | 5.0 | 1970-01-01T00:00:00.000005 | | 5.0 | 1970-01-01T00:00:00.000005 | +-------+----------------------------+ - "###); + "); } /// Basic user defined aggregate @@ -91,13 +90,13 @@ async fn test_udaf() { let actual = execute(&ctx, sql).await.unwrap(); - insta::assert_snapshot!(batches_to_string(&actual), @r###" + insta::assert_snapshot!(batches_to_string(&actual), @r" +----------------------------+ | time_sum(t.time) | +----------------------------+ | 1970-01-01T00:00:00.000019 | +----------------------------+ - "###); + "); // normal aggregates call update_batch assert!(test_state.update_batch()); @@ -112,7 +111,7 @@ async fn test_udaf_as_window() { let actual = execute(&ctx, sql).await.unwrap(); - insta::assert_snapshot!(batches_to_string(&actual), @r###" + insta::assert_snapshot!(batches_to_string(&actual), @r" +----------------------------+ | time_sum | +----------------------------+ @@ -122,7 +121,7 @@ async fn test_udaf_as_window() { | 1970-01-01T00:00:00.000019 | | 1970-01-01T00:00:00.000019 | +----------------------------+ - "###); + "); // aggregate over the entire window function call update_batch assert!(test_state.update_batch()); @@ -137,7 +136,7 @@ async fn test_udaf_as_window_with_frame() { let actual = execute(&ctx, sql).await.unwrap(); - insta::assert_snapshot!(batches_to_string(&actual), @r###" + insta::assert_snapshot!(batches_to_string(&actual), @r" +----------------------------+ | time_sum | +----------------------------+ @@ -147,7 +146,7 @@ async fn test_udaf_as_window_with_frame() { | 1970-01-01T00:00:00.000014 | | 1970-01-01T00:00:00.000010 | +----------------------------+ - "###); + "); // user defined aggregates with window frame should be calling retract batch assert!(test_state.update_batch()); @@ -164,7 +163,10 @@ async fn test_udaf_as_window_with_frame_without_retract_batch() { let sql = "SELECT time_sum(time) OVER(ORDER BY time ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) as time_sum from t"; // Note if this query ever does start working let err = execute(&ctx, sql).await.unwrap_err(); - assert_contains!(err.to_string(), "This feature is 
not implemented: Aggregate can not be used as a sliding accumulator because `retract_batch` is not implemented: time_sum(t.time) ORDER BY [t.time ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING"); + assert_contains!( + err.to_string(), + "This feature is not implemented: Aggregate can not be used as a sliding accumulator because `retract_batch` is not implemented: time_sum(t.time) ORDER BY [t.time ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING" + ); } /// Basic query for with a udaf returning a structure @@ -175,13 +177,13 @@ async fn test_udaf_returning_struct() { let actual = execute(&ctx, sql).await.unwrap(); - insta::assert_snapshot!(batches_to_string(&actual), @r###" + insta::assert_snapshot!(batches_to_string(&actual), @r" +------------------------------------------------+ | first(t.value,t.time) | +------------------------------------------------+ | {value: 2.0, time: 1970-01-01T00:00:00.000002} | +------------------------------------------------+ - "###); + "); } /// Demonstrate extracting the fields from a structure using a subquery @@ -192,13 +194,13 @@ async fn test_udaf_returning_struct_subquery() { let actual = execute(&ctx, sql).await.unwrap(); - insta::assert_snapshot!(batches_to_string(&actual), @r###" + insta::assert_snapshot!(batches_to_string(&actual), @r" +-----------------+----------------------------+ | sq.first[value] | sq.first[time] | +-----------------+----------------------------+ | 2.0 | 1970-01-01T00:00:00.000002 | +-----------------+----------------------------+ - "###); + "); } #[tokio::test] @@ -212,13 +214,13 @@ async fn test_udaf_shadows_builtin_fn() { // compute with builtin `sum` aggregator let actual = execute(&ctx, sql).await.unwrap(); - insta::assert_snapshot!(batches_to_string(&actual), @r###" + insta::assert_snapshot!(batches_to_string(&actual), @r#" +---------------------------------------+ | sum(arrow_cast(t.time,Utf8("Int64"))) | +---------------------------------------+ | 19000 | +---------------------------------------+ - "###); + "#); // Register `TimeSum` with name `sum`. 
This will shadow the builtin one TimeSum::register(&mut ctx, test_state.clone(), "sum"); @@ -226,13 +228,13 @@ async fn test_udaf_shadows_builtin_fn() { let actual = execute(&ctx, sql).await.unwrap(); - insta::assert_snapshot!(batches_to_string(&actual), @r###" + insta::assert_snapshot!(batches_to_string(&actual), @r" +----------------------------+ | sum(t.time) | +----------------------------+ | 1970-01-01T00:00:00.000019 | +----------------------------+ - "###); + "); } async fn execute(ctx: &SessionContext, sql: &str) -> Result> { @@ -272,13 +274,13 @@ async fn simple_udaf() -> Result<()> { let result = ctx.sql("SELECT MY_AVG(a) FROM t").await?.collect().await?; - insta::assert_snapshot!(batches_to_string(&result), @r###" + insta::assert_snapshot!(batches_to_string(&result), @r" +-------------+ | my_avg(t.a) | +-------------+ | 3.0 | +-------------+ - "###); + "); Ok(()) } @@ -329,9 +331,10 @@ async fn case_sensitive_identifiers_user_defined_aggregates() -> Result<()> { // doesn't work as it was registered as non lowercase let err = ctx.sql("SELECT MY_AVG(i) FROM t").await.unwrap_err(); - assert!(err - .to_string() - .contains("Error during planning: Invalid function \'my_avg\'")); + assert!( + err.to_string() + .contains("Error during planning: Invalid function \'my_avg\'") + ); // Can call it if you put quotes let result = ctx @@ -340,13 +343,13 @@ async fn case_sensitive_identifiers_user_defined_aggregates() -> Result<()> { .collect() .await?; - insta::assert_snapshot!(batches_to_string(&result), @r###" + insta::assert_snapshot!(batches_to_string(&result), @r" +-------------+ | MY_AVG(t.i) | +-------------+ | 1.0 | +-------------+ - "###); + "); Ok(()) } @@ -372,13 +375,13 @@ async fn test_user_defined_functions_with_alias() -> Result<()> { let result = plan_and_collect(&ctx, "SELECT dummy(i) FROM t").await?; - insta::assert_snapshot!(batches_to_string(&result), @r###" + insta::assert_snapshot!(batches_to_string(&result), @r" +------------+ | dummy(t.i) | +------------+ | 1.0 | +------------+ - "###); + "); let alias_result = plan_and_collect(&ctx, "SELECT dummy_alias(i) FROM t").await?; @@ -449,13 +452,13 @@ async fn test_parameterized_aggregate_udf() -> Result<()> { let actual = DataFrame::new(ctx.state(), plan).collect().await?; - insta::assert_snapshot!(batches_to_string(&actual), @r###" + insta::assert_snapshot!(batches_to_string(&actual), @r" +------+---+---+ | text | a | b | +------+---+---+ | foo | 1 | 2 | +------+---+---+ - "###); + "); ctx.deregister_table("t")?; Ok(()) @@ -569,6 +572,7 @@ impl TimeSum { Self { sum: 0, test_state } } + #[expect(clippy::needless_pass_by_value)] fn register(ctx: &mut SessionContext, test_state: Arc, name: &str) { let timestamp_type = DataType::Timestamp(TimeUnit::Nanosecond, None); let input_type = vec![timestamp_type.clone()]; @@ -760,11 +764,11 @@ impl Accumulator for FirstSelector { // Update the actual values for (value, time) in v.iter().zip(t.iter()) { - if let (Some(time), Some(value)) = (time, value) { - if time < self.time { - self.value = value; - self.time = time; - } + if let (Some(time), Some(value)) = (time, value) + && time < self.time + { + self.value = value; + self.time = time; } } @@ -788,10 +792,6 @@ struct TestGroupsAccumulator { } impl AggregateUDFImpl for TestGroupsAccumulator { - fn as_any(&self) -> &dyn Any { - self - } - fn name(&self) -> &str { "geo_mean" } @@ -931,10 +931,6 @@ impl MetadataBasedAggregateUdf { } impl AggregateUDFImpl for MetadataBasedAggregateUdf { - fn as_any(&self) -> &dyn Any { - self - } - fn 
name(&self) -> &str { &self.name } diff --git a/datafusion/core/tests/user_defined/user_defined_async_scalar_functions.rs b/datafusion/core/tests/user_defined/user_defined_async_scalar_functions.rs new file mode 100644 index 0000000000000..58a5cb803982b --- /dev/null +++ b/datafusion/core/tests/user_defined/user_defined_async_scalar_functions.rs @@ -0,0 +1,167 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use arrow::array::{Int32Array, RecordBatch, StringArray}; +use arrow::datatypes::{DataType, Field, Schema}; +use async_trait::async_trait; +use datafusion::prelude::*; +use datafusion_common::test_util::format_batches; +use datafusion_common::{Result, assert_batches_eq}; +use datafusion_expr::async_udf::{AsyncScalarUDF, AsyncScalarUDFImpl}; +use datafusion_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, +}; + +fn register_table_and_udf() -> Result<SessionContext> { + let num_rows = 3; + let batch_size = 2; + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("prompt", DataType::Utf8, false), + ])); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from((0..num_rows).collect::<Vec<_>>())), + Arc::new(StringArray::from( + (0..num_rows) + .map(|i| format!("prompt{i}")) + .collect::<Vec<_>>(), + )), + ], + )?; + + let ctx = SessionContext::new(); + ctx.register_batch("test_table", batch)?; + + ctx.register_udf( + AsyncScalarUDF::new(Arc::new(TestAsyncUDFImpl::new(batch_size))) + .into_scalar_udf(), + ); + + Ok(ctx) +} + +// This test checks the case where batch_size doesn't evenly divide +// the number of rows.
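With `num_rows = 3` and `batch_size = 2`, the expectation is that `ideal_batch_size` causes the input to be re-chunked so the async UDF sees one full chunk of two rows followed by a trailing partial chunk of one row, with the results stitched back together in order. A quick sanity check of that arithmetic (assuming the re-batching is a plain split at `ideal_batch_size` boundaries):

    let (num_rows, batch_size) = (3usize, 2usize);
    // ceil(3 / 2) = 2 async invocations: rows [0, 1] first, then row [2]
    let invocations = num_rows.div_ceil(batch_size);
    assert_eq!(invocations, 2);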
+#[tokio::test] +async fn test_async_udf_with_non_modular_batch_size() -> Result<()> { + let ctx = register_table_and_udf()?; + + let df = ctx + .sql("SELECT id, test_async_udf(prompt) as result FROM test_table") + .await?; + + let result = df.collect().await?; + + assert_batches_eq!( + &[ + "+----+---------+", + "| id | result |", + "+----+---------+", + "| 0 | prompt0 |", + "| 1 | prompt1 |", + "| 2 | prompt2 |", + "+----+---------+" + ], + &result + ); + + Ok(()) +} + +// This test checks if metrics are printed for `AsyncFuncExec` +#[tokio::test] +async fn test_async_udf_metrics() -> Result<()> { + let ctx = register_table_and_udf()?; + + let df = ctx + .sql( + "EXPLAIN ANALYZE SELECT id, test_async_udf(prompt) as result FROM test_table", + ) + .await?; + + let result = df.collect().await?; + + let explain_analyze_str = format_batches(&result)?.to_string(); + let async_func_exec_without_metrics = + explain_analyze_str.split("\n").any(|metric_line| { + metric_line.contains("AsyncFuncExec") + && !metric_line.contains("output_rows=3") + }); + + assert!(!async_func_exec_without_metrics); + + Ok(()) +} + +#[derive(Debug, PartialEq, Eq, Hash, Clone)] +struct TestAsyncUDFImpl { + batch_size: usize, + signature: Signature, +} + +impl TestAsyncUDFImpl { + fn new(batch_size: usize) -> Self { + Self { + batch_size, + signature: Signature::exact(vec![DataType::Utf8], Volatility::Volatile), + } + } +} + +impl ScalarUDFImpl for TestAsyncUDFImpl { + fn name(&self) -> &str { + "test_async_udf" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> { + Ok(DataType::Utf8) + } + + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> { + panic!("Call invoke_async_with_args instead") + } +} + +#[async_trait] +impl AsyncScalarUDFImpl for TestAsyncUDFImpl { + fn ideal_batch_size(&self) -> Option<usize> { + Some(self.batch_size) + } + async fn invoke_async_with_args( + &self, + args: ScalarFunctionArgs, + ) -> Result<ColumnarValue> { + let arg1 = &args.args[0]; + let results = call_external_service(arg1.clone()).await?; + Ok(results) + } +} + +/// Simulates calling an async external service +async fn call_external_service(arg1: ColumnarValue) -> Result<ColumnarValue> { + Ok(arg1) +} diff --git a/datafusion/core/tests/user_defined/user_defined_plan.rs b/datafusion/core/tests/user_defined/user_defined_plan.rs index ffe0ba021edb3..505468a19cd37 100644 --- a/datafusion/core/tests/user_defined/user_defined_plan.rs +++ b/datafusion/core/tests/user_defined/user_defined_plan.rs @@ -60,7 +60,7 @@ use std::fmt::Debug; use std::hash::Hash; use std::task::{Context, Poll}; -use std::{any::Any, collections::BTreeMap, fmt, sync::Arc}; +use std::{collections::BTreeMap, fmt, sync::Arc}; use arrow::array::{Array, ArrayRef, StringViewArray}; use arrow::{ @@ -70,7 +70,7 @@ use arrow::{ use datafusion::execution::session_state::SessionStateBuilder; use datafusion::{ common::cast::as_int64_array, - common::{arrow_datafusion_err, internal_err, DFSchemaRef}, + common::{DFSchemaRef, arrow_datafusion_err}, error::{DataFusionError, Result}, execution::{ context::{QueryPlanner, SessionState, TaskContext}, @@ -84,17 +84,19 @@ use datafusion::{ physical_expr::EquivalenceProperties, physical_plan::{ DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, - PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics, + PlanProperties, RecordBatchStream, SendableRecordBatchStream, }, physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner},
prelude::{SessionConfig, SessionContext}, }; use datafusion_common::config::ConfigOptions; -use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; -use datafusion_common::ScalarValue; +use datafusion_common::tree_node::{ + Transformed, TransformedResult, TreeNode, TreeNodeRecursion, +}; +use datafusion_common::{ScalarValue, assert_eq_or_internal_err, assert_or_internal_err}; use datafusion_expr::{FetchType, InvariantLevel, Projection, SortExpr}; -use datafusion_optimizer::optimizer::ApplyOrder; use datafusion_optimizer::AnalyzerRule; +use datafusion_optimizer::optimizer::ApplyOrder; use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType}; use async_trait::async_trait; @@ -161,7 +163,7 @@ async fn run_and_compare_query(ctx: SessionContext, description: &str) -> Result insta::with_settings!({ description => description, }, { - insta::assert_snapshot!(actual, @r###" + insta::assert_snapshot!(actual, @r" +-------------+---------+ | customer_id | revenue | +-------------+---------+ @@ -169,7 +171,7 @@ async fn run_and_compare_query(ctx: SessionContext, description: &str) -> Result | jorge | 200 | | andy | 150 | +-------------+---------+ - "###); + "); }); } @@ -188,13 +190,13 @@ async fn run_and_compare_query_with_analyzer_rule( insta::with_settings!({ description => description, }, { - insta::assert_snapshot!(actual, @r###" + insta::assert_snapshot!(actual, @r" +------------+--------------------------+ | UInt64(42) | arrow_typeof(UInt64(42)) | +------------+--------------------------+ | 42 | UInt64 | +------------+--------------------------+ - "###); + "); }); Ok(()) @@ -212,7 +214,7 @@ async fn run_and_compare_query_with_auto_schemas( insta::with_settings!({ description => description, }, { - insta::assert_snapshot!(actual, @r###" + insta::assert_snapshot!(actual, @r" +----------+----------+ | column_1 | column_2 | +----------+----------+ @@ -220,7 +222,7 @@ async fn run_and_compare_query_with_auto_schemas( | jorge | 200 | | andy | 150 | +----------+----------+ - "###); + "); }); Ok(()) @@ -433,21 +435,21 @@ impl OptimizerRule for OptimizerMakeExtensionNodeInvalid { plan: LogicalPlan, _config: &dyn OptimizerConfig, ) -> Result, DataFusionError> { - if let LogicalPlan::Extension(Extension { node }) = &plan { - if let Some(prev) = node.as_any().downcast_ref::() { - return Ok(Transformed::yes(LogicalPlan::Extension(Extension { - node: Arc::new(TopKPlanNode { - k: prev.k, - input: prev.input.clone(), - expr: prev.expr.clone(), - // In a real use case, this rewriter could have change the number of inputs, etc - invariant_mock: Some(InvariantMock { - should_fail_invariant: true, - kind: InvariantLevel::Always, - }), + if let LogicalPlan::Extension(Extension { node }) = &plan + && let Some(prev) = node.as_any().downcast_ref::() + { + return Ok(Transformed::yes(LogicalPlan::Extension(Extension { + node: Arc::new(TopKPlanNode { + k: prev.k, + input: prev.input.clone(), + expr: prev.expr.clone(), + // In a real use case, this rewriter could have change the number of inputs, etc + invariant_mock: Some(InvariantMock { + should_fail_invariant: true, + kind: InvariantLevel::Always, }), - }))); - } + }), + }))); }; Ok(Transformed::no(plan)) @@ -515,23 +517,18 @@ impl OptimizerRule for TopKOptimizerRule { return Ok(Transformed::no(plan)); }; - if let LogicalPlan::Sort(Sort { - ref expr, - ref input, - .. - }) = limit.input.as_ref() + if let LogicalPlan::Sort(Sort { expr, input, .. 
}) = limit.input.as_ref() + && expr.len() == 1 { - if expr.len() == 1 { - // we found a sort with a single sort expr, replace with a a TopK - return Ok(Transformed::yes(LogicalPlan::Extension(Extension { - node: Arc::new(TopKPlanNode { - k: fetch, - input: input.as_ref().clone(), - expr: expr[0].clone(), - invariant_mock: self.invariant_mock.clone(), - }), - }))); - } + // we found a sort with a single sort expr, replace with a TopK + return Ok(Transformed::yes(LogicalPlan::Extension(Extension { + node: Arc::new(TopKPlanNode { + k: fetch, + input: input.as_ref().clone(), + expr: expr[0].clone(), + invariant_mock: self.invariant_mock.clone(), + }), + }))); } Ok(Transformed::no(plan)) @@ -585,9 +582,10 @@ impl UserDefinedLogicalNodeCore for TopKPlanNode { kind, }) = self.invariant_mock.clone() { - if should_fail_invariant && check == kind { - return internal_err!("node fails check, such as improper inputs"); - } + assert_or_internal_err!( + !(should_fail_invariant && check == kind), + "node fails check, such as improper inputs" + ); } Ok(()) } @@ -657,13 +655,17 @@ struct TopKExec { input: Arc<dyn ExecutionPlan>, /// The maximum number of values k: usize, - cache: PlanProperties, + cache: Arc<PlanProperties>, } impl TopKExec { fn new(input: Arc<dyn ExecutionPlan>, k: usize) -> Self { let cache = Self::compute_properties(input.schema()); - Self { input, k, cache } + Self { + input, + k, + cache: Arc::new(cache), + } } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. @@ -704,11 +706,7 @@ impl ExecutionPlan for TopKExec { } /// Return a reference to Any that can be used for downcasting - fn as_any(&self) -> &dyn Any { - self - } - - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc<PlanProperties> { &self.cache } @@ -733,9 +731,11 @@ impl ExecutionPlan for TopKExec { partition: usize, context: Arc<TaskContext>, ) -> Result<SendableRecordBatchStream> { - if 0 != partition { - return internal_err!("TopKExec invalid partition {partition}"); - } + assert_eq_or_internal_err!( + partition, + 0, + "TopKExec invalid partition {partition}" + ); Ok(Box::pin(TopKReader { input: self.input.execute(partition, context)?, @@ -745,10 +745,20 @@ impl ExecutionPlan for TopKExec { })) } - fn statistics(&self) -> Result<Statistics> { - // to improve the optimizability of this plan - // better statistics inference could be provided - Ok(Statistics::new_unknown(&self.schema())) + fn apply_expressions( + &self, + f: &mut dyn FnMut( + &dyn datafusion::physical_plan::PhysicalExpr, + ) -> Result<TreeNodeRecursion>, + ) -> Result<TreeNodeRecursion> { + // Visit expressions in the output ordering from equivalence properties + let mut tnr = TreeNodeRecursion::Continue; + if let Some(ordering) = self.cache.output_ordering() { + for sort_expr in ordering { + tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?; + } + } + Ok(tnr) + } } diff --git a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs index 3ca8f846aa5e5..b758aeb5209e8 100644 --- a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs +++ b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs @@ -15,16 +15,15 @@ // specific language governing permissions and limitations // under the License.
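A note on the snapshot churn in this and the surrounding test files: `insta` inline snapshots are ordinary Rust raw strings, so `@r###"..."###` can shrink to `@r"..."` whenever the snapshot body contains no double quotes, while bodies that embed quotes (such as the `arrow_cast(t.time,Utf8("Int64"))` header earlier in this patch) keep one hash as `@r#"..."#`. The underlying raw-string rule, for reference:

    let plain = r"no quotes inside, so no hash delimiter is needed";
    let hashed = r#"embeds "quotes", so a # delimiter is required"#;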
-use std::any::Any; use std::collections::HashMap; use std::hash::{Hash, Hasher}; use std::sync::Arc; -use arrow::array::{as_string_array, create_array, record_batch, Int8Array, UInt64Array}; use arrow::array::{ - builder::BooleanBuilder, cast::AsArray, Array, ArrayRef, Float32Array, Float64Array, - Int32Array, RecordBatch, StringArray, + Array, ArrayRef, Float32Array, Float64Array, Int32Array, RecordBatch, StringArray, + builder::BooleanBuilder, cast::AsArray, }; +use arrow::array::{Int8Array, UInt64Array, as_string_array, create_array, record_batch}; use arrow::compute::kernels::numeric::add; use arrow::datatypes::{DataType, Field, Schema}; use arrow_schema::extension::{Bool8, CanonicalExtensionType, ExtensionType}; @@ -38,15 +37,17 @@ use datafusion_common::metadata::FieldMetadata; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::utils::take_function_args; use datafusion_common::{ - assert_batches_eq, assert_batches_sorted_eq, assert_contains, exec_datafusion_err, - exec_err, not_impl_err, plan_err, DFSchema, DataFusionError, Result, ScalarValue, + DFSchema, DataFusionError, Result, ScalarValue, assert_batches_eq, + assert_batches_sorted_eq, assert_contains, exec_datafusion_err, exec_err, + not_impl_err, plan_err, }; -use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; +use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext}; use datafusion_expr::{ - lit_with_metadata, Accumulator, ColumnarValue, CreateFunction, CreateFunctionBody, - LogicalPlanBuilder, OperateFunctionArg, ReturnFieldArgs, ScalarFunctionArgs, - ScalarUDF, ScalarUDFImpl, Signature, Volatility, + Accumulator, ColumnarValue, CreateFunction, CreateFunctionBody, LogicalPlanBuilder, + OperateFunctionArg, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, + Signature, Volatility, lit_with_metadata, }; +use datafusion_expr_common::signature::TypeSignature; use datafusion_functions_nested::range::range_udf; use parking_lot::Mutex; use regex::Regex; @@ -63,13 +64,13 @@ async fn csv_query_custom_udf_with_cast() -> Result<()> { let sql = "SELECT avg(custom_sqrt(c11)) FROM aggregate_test_100"; let actual = plan_and_collect(&ctx, sql).await?; - insta::assert_snapshot!(batches_to_string(&actual), @r###" + insta::assert_snapshot!(batches_to_string(&actual), @r" +------------------------------------------+ | avg(custom_sqrt(aggregate_test_100.c11)) | +------------------------------------------+ | 0.6584408483418835 | +------------------------------------------+ - "###); + "); Ok(()) } @@ -82,13 +83,13 @@ async fn csv_query_avg_sqrt() -> Result<()> { let sql = "SELECT avg(custom_sqrt(c12)) FROM aggregate_test_100"; let actual = plan_and_collect(&ctx, sql).await?; - insta::assert_snapshot!(batches_to_string(&actual), @r###" + insta::assert_snapshot!(batches_to_string(&actual), @r" +------------------------------------------+ | avg(custom_sqrt(aggregate_test_100.c12)) | +------------------------------------------+ | 0.6706002946036459 | +------------------------------------------+ - "###); + "); Ok(()) } @@ -153,7 +154,7 @@ async fn scalar_udf() -> Result<()> { let result = DataFrame::new(ctx.state(), plan).collect().await?; - insta::assert_snapshot!(batches_to_string(&result), @r###" + insta::assert_snapshot!(batches_to_string(&result), @r" +-----+-----+-----------------+ | a | b | my_add(t.a,t.b) | +-----+-----+-----------------+ @@ -162,7 +163,7 @@ async fn scalar_udf() -> Result<()> { | 10 | 12 | 22 | | 100 | 120 | 220 | +-----+-----+-----------------+ - "###); + 
"); let batch = &result[0]; let a = as_int32_array(batch.column(0))?; @@ -199,10 +200,6 @@ impl std::fmt::Debug for Simple0ArgsScalarUDF { } impl ScalarUDFImpl for Simple0ArgsScalarUDF { - fn as_any(&self) -> &dyn Any { - self - } - fn name(&self) -> &str { &self.name } @@ -279,7 +276,7 @@ async fn scalar_udf_zero_params() -> Result<()> { ctx.register_udf(ScalarUDF::from(get_100_udf)); let result = plan_and_collect(&ctx, "select get_100() a from t").await?; - insta::assert_snapshot!(batches_to_string(&result), @r###" + insta::assert_snapshot!(batches_to_string(&result), @r" +-----+ | a | +-----+ @@ -288,22 +285,22 @@ async fn scalar_udf_zero_params() -> Result<()> { | 100 | | 100 | +-----+ - "###); + "); let result = plan_and_collect(&ctx, "select get_100() a").await?; - insta::assert_snapshot!(batches_to_string(&result), @r###" + insta::assert_snapshot!(batches_to_string(&result), @r" +-----+ | a | +-----+ | 100 | +-----+ - "###); + "); let result = plan_and_collect(&ctx, "select get_100() from t where a=999").await?; - insta::assert_snapshot!(batches_to_string(&result), @r###" + insta::assert_snapshot!(batches_to_string(&result), @r" ++ ++ - "###); + "); Ok(()) } @@ -330,13 +327,13 @@ async fn scalar_udf_override_built_in_scalar_function() -> Result<()> { // Make sure that the UDF is used instead of the built-in function let result = plan_and_collect(&ctx, "select abs(a) a from t").await?; - insta::assert_snapshot!(batches_to_string(&result), @r###" + insta::assert_snapshot!(batches_to_string(&result), @r" +---+ | a | +---+ | 1 | +---+ - "###); + "); Ok(()) } @@ -425,20 +422,21 @@ async fn case_sensitive_identifiers_user_defined_functions() -> Result<()> { let err = plan_and_collect(&ctx, "SELECT MY_FUNC(i) FROM t") .await .unwrap_err(); - assert!(err - .to_string() - .contains("Error during planning: Invalid function \'my_func\'")); + assert!( + err.to_string() + .contains("Error during planning: Invalid function \'my_func\'") + ); // Can call it if you put quotes let result = plan_and_collect(&ctx, "SELECT \"MY_FUNC\"(i) FROM t").await?; - insta::assert_snapshot!(batches_to_string(&result), @r###" + insta::assert_snapshot!(batches_to_string(&result), @r" +--------------+ | MY_FUNC(t.i) | +--------------+ | 1 | +--------------+ - "###); + "); Ok(()) } @@ -469,13 +467,13 @@ async fn test_user_defined_functions_with_alias() -> Result<()> { ctx.register_udf(udf); let result = plan_and_collect(&ctx, "SELECT dummy(i) FROM t").await?; - insta::assert_snapshot!(batches_to_string(&result), @r###" + insta::assert_snapshot!(batches_to_string(&result), @r" +------------+ | dummy(t.i) | +------------+ | 1 | +------------+ - "###); + "); let alias_result = plan_and_collect(&ctx, "SELECT dummy_alias(i) FROM t").await?; insta::assert_snapshot!(batches_to_string(&alias_result), @r" @@ -508,10 +506,6 @@ impl AddIndexToStringVolatileScalarUDF { } impl ScalarUDFImpl for AddIndexToStringVolatileScalarUDF { - fn as_any(&self) -> &dyn Any { - self - } - fn name(&self) -> &str { &self.name } @@ -675,9 +669,6 @@ impl CastToI64UDF { } impl ScalarUDFImpl for CastToI64UDF { - fn as_any(&self) -> &dyn Any { - self - } fn name(&self) -> &str { "cast_to_i64" } @@ -696,7 +687,7 @@ impl ScalarUDFImpl for CastToI64UDF { fn simplify( &self, mut args: Vec, - info: &dyn SimplifyInfo, + info: &SimplifyContext, ) -> Result { // DataFusion should have ensured the function is called with just a // single argument @@ -712,10 +703,7 @@ impl ScalarUDFImpl for CastToI64UDF { arg } else { // need to use an actual cast to get the 
correct type - Expr::Cast(datafusion_expr::Cast { - expr: Box::new(arg), - data_type: DataType::Int64, - }) + Expr::Cast(datafusion_expr::Cast::new(Box::new(arg), DataType::Int64)) }; // return the newly written argument to DataFusion Ok(ExprSimplifyResult::Simplified(new_expr)) @@ -800,9 +788,6 @@ impl TakeUDF { /// Implement a ScalarUDFImpl whose return type is a function of the input values impl ScalarUDFImpl for TakeUDF { - fn as_any(&self) -> &dyn Any { - self - } fn name(&self) -> &str { "take" } @@ -945,13 +930,10 @@ struct ScalarFunctionWrapper { expr: Expr, signature: Signature, return_type: DataType, + defaults: Vec<Option<Expr>>, } impl ScalarUDFImpl for ScalarFunctionWrapper { - fn as_any(&self) -> &dyn Any { - self - } - fn name(&self) -> &str { &self.name } @@ -971,9 +953,9 @@ impl ScalarUDFImpl for ScalarFunctionWrapper { fn simplify( &self, args: Vec<Expr>, - _info: &dyn SimplifyInfo, + _info: &SimplifyContext, ) -> Result<ExprSimplifyResult> { - let replacement = Self::replacement(&self.expr, &args)?; + let replacement = Self::replacement(&self.expr, &args, &self.defaults)?; Ok(ExprSimplifyResult::Simplified(replacement)) } @@ -981,7 +963,11 @@ impl ScalarFunctionWrapper { impl ScalarFunctionWrapper { // replaces placeholders with actual arguments - fn replacement(expr: &Expr, args: &[Expr]) -> Result<Expr> { + fn replacement( + expr: &Expr, + args: &[Expr], + defaults: &[Option<Expr>], + ) -> Result<Expr> { let result = expr.clone().transform(|e| { let r = match e { Expr::Placeholder(placeholder) => { @@ ... Self::parse_placeholder_identifier(&placeholder.id)?; if placeholder_position < args.len() { Transformed::yes(args[placeholder_position].clone()) - } else { + } else if placeholder_position >= defaults.len() { exec_err!( - "Function argument {} not provided, argument missing!", + "Invalid placeholder, out of range: {}", placeholder.id )?
+ } else { + match defaults[placeholder_position] { + Some(ref default) => Transformed::yes(default.clone()), + None => exec_err!( + "Function argument {} not provided, argument missing!", + placeholder.id + )?, + } } } _ => Transformed::no(e), @@ -1021,6 +1015,32 @@ impl TryFrom<CreateFunction> for ScalarFunctionWrapper { type Error = DataFusionError; fn try_from(definition: CreateFunction) -> std::result::Result<Self, Self::Error> { + let args = definition.args.unwrap_or_default(); + let defaults: Vec<Option<Expr>> = + args.iter().map(|a| a.default_expr.clone()).collect(); + let signature: Signature = match defaults.iter().position(|v| v.is_some()) { + Some(pos) => { + let mut type_signatures: Vec<TypeSignature> = vec![]; + // Generate all valid signatures + for n in pos..defaults.len() + 1 { + if n == 0 { + type_signatures.push(TypeSignature::Nullary) + } else { + type_signatures.push(TypeSignature::Exact( + args.iter().take(n).map(|a| a.data_type.clone()).collect(), + )) + } + } + Signature::one_of( + type_signatures, + definition.params.behavior.unwrap_or(Volatility::Volatile), + ) + } + None => Signature::exact( + args.iter().map(|a| a.data_type.clone()).collect(), + definition.params.behavior.unwrap_or(Volatility::Volatile), + ), + }; Ok(Self { name: definition.name, expr: definition @@ -1030,15 +1050,8 @@ impl TryFrom<CreateFunction> for ScalarFunctionWrapper { return_type: definition .return_type .expect("Return type has to be defined!"), - signature: Signature::exact( - definition - .args - .unwrap_or_default() - .into_iter() - .map(|a| a.data_type) - .collect(), - definition.params.behavior.unwrap_or(Volatility::Volatile), - ), + signature, + defaults, }) } } @@ -1061,10 +1074,11 @@ async fn create_scalar_function_from_sql_statement() -> Result<()> { // Create the `better_add` function dynamically via CREATE FUNCTION statement assert!(ctx.sql(sql).await.is_ok()); // try to `drop function` when sql options have allow ddl disabled - assert!(ctx - .sql_with_options("drop function better_add", options) - .await - .is_err()); + assert!( + ctx.sql_with_options("drop function better_add", options) + .await + .is_err() + ); let result = ctx .sql("select better_add(2.0, 2.0)") .await @@ -1109,6 +1123,175 @@ async fn create_scalar_function_from_sql_statement() -> Result<()> { "#; assert!(ctx.sql(bad_definition_sql).await.is_err()); + // FIXME: Definitions with invalid placeholders are allowed, fail at runtime + let bad_expression_sql = r#" + CREATE FUNCTION better_add(DOUBLE, DOUBLE) + RETURNS DOUBLE + RETURN $1 + $3 + "#; + assert!(ctx.sql(bad_expression_sql).await.is_ok()); + + let err = ctx + .sql("select better_add(2.0, 2.0)") + .await? + .collect() + .await + .expect_err("unknown placeholder"); + let expected = "Optimizer rule 'simplify_expressions' failed\ncaused by\nExecution error: Invalid placeholder, out of range: $3"; + assert!(expected.starts_with(&err.strip_backtrace())); + + Ok(()) +} + +#[tokio::test] +async fn create_scalar_function_from_sql_statement_named_arguments() -> Result<()> { + let function_factory = Arc::new(CustomFunctionFactory::default()); + let ctx = SessionContext::new().with_function_factory(function_factory.clone()); + + let sql = r#" + CREATE FUNCTION better_add(a DOUBLE, b DOUBLE) + RETURNS DOUBLE + RETURN $a + $b + "#; + + assert!(ctx.sql(sql).await.is_ok()); + + let result = ctx + .sql("select better_add(2.0, 2.0)") + .await?
+ .collect() + .await?; + + assert_batches_eq!( + &[ + "+-----------------------------------+", + "| better_add(Float64(2),Float64(2)) |", + "+-----------------------------------+", + "| 4.0 |", + "+-----------------------------------+", + ], + &result + ); + + // cannot mix named and positional style + let bad_expression_sql = r#" + CREATE FUNCTION bad_expression_fun(DOUBLE, b DOUBLE) + RETURNS DOUBLE + RETURN $1 + $b + "#; + let err = ctx + .sql(bad_expression_sql) + .await + .expect_err("cannot mix named and positional style"); + let expected = "Error during planning: All function arguments must use either named or positional style."; + assert!(expected.starts_with(&err.strip_backtrace())); + + Ok(()) +} + +#[tokio::test] +async fn create_scalar_function_from_sql_statement_default_arguments() -> Result<()> { + let function_factory = Arc::new(CustomFunctionFactory::default()); + let ctx = SessionContext::new().with_function_factory(function_factory.clone()); + + let sql = r#" + CREATE FUNCTION better_add(a DOUBLE = 2.0, b DOUBLE = 2.0) + RETURNS DOUBLE + RETURN $a + $b + "#; + + assert!(ctx.sql(sql).await.is_ok()); + + // Check all function arity supported + let result = ctx.sql("select better_add()").await?.collect().await?; + + assert_batches_eq!( + &[ + "+--------------+", + "| better_add() |", + "+--------------+", + "| 4.0 |", + "+--------------+", + ], + &result + ); + + let result = ctx.sql("select better_add(2.0)").await?.collect().await?; + + assert_batches_eq!( + &[ + "+------------------------+", + "| better_add(Float64(2)) |", + "+------------------------+", + "| 4.0 |", + "+------------------------+", + ], + &result + ); + + let result = ctx + .sql("select better_add(2.0, 2.0)") + .await? + .collect() + .await?; + + assert_batches_eq!( + &[ + "+-----------------------------------+", + "| better_add(Float64(2),Float64(2)) |", + "+-----------------------------------+", + "| 4.0 |", + "+-----------------------------------+", + ], + &result + ); + + assert!(ctx.sql("select better_add(2.0, 2.0, 2.0)").await.is_err()); + assert!(ctx.sql("drop function better_add").await.is_ok()); + + // works with positional style + let sql = r#" + CREATE FUNCTION better_add(DOUBLE, DOUBLE = 2.0) + RETURNS DOUBLE + RETURN $1 + $2 + "#; + assert!(ctx.sql(sql).await.is_ok()); + + assert!(ctx.sql("select better_add()").await.is_err()); + let result = ctx.sql("select better_add(2.0)").await?.collect().await?; + assert_batches_eq!( + &[ + "+------------------------+", + "| better_add(Float64(2)) |", + "+------------------------+", + "| 4.0 |", + "+------------------------+", + ], + &result + ); + + // non-default argument cannot follow default argument + let bad_expression_sql = r#" + CREATE FUNCTION bad_expression_fun(a DOUBLE = 2.0, b DOUBLE) + RETURNS DOUBLE + RETURN $a + $b + "#; + let err = ctx + .sql(bad_expression_sql) + .await + .expect_err("non-default argument cannot follow default argument"); + let expected = + "Error during planning: Non-default arguments cannot follow default arguments."; + assert!(expected.starts_with(&err.strip_backtrace())); + + let expression_sql = r#" + CREATE FUNCTION bad_expression_fun(DOUBLE, DOUBLE DEFAULT 2.0) + RETURNS DOUBLE + RETURN $1 + $2 + "#; + let result = ctx.sql(expression_sql).await; + + assert!(result.is_ok()); Ok(()) } @@ -1239,10 +1422,6 @@ impl MyRegexUdf { } impl ScalarUDFImpl for MyRegexUdf { - fn as_any(&self) -> &dyn Any { - self - } - fn name(&self) -> &str { "regex_udf" } @@ -1407,10 +1586,6 @@ impl MetadataBasedUdf { } impl ScalarUDFImpl for 
MetadataBasedUdf { - fn as_any(&self) -> &dyn Any { - self - } - fn name(&self) -> &str { &self.name } @@ -1616,10 +1791,6 @@ impl Default for ExtensionBasedUdf { } } impl ScalarUDFImpl for ExtensionBasedUdf { - fn as_any(&self) -> &dyn Any { - self - } - fn name(&self) -> &str { &self.name } @@ -1786,9 +1957,6 @@ async fn test_config_options_work_for_scalar_func() -> Result<()> { } impl ScalarUDFImpl for TestScalarUDF { - fn as_any(&self) -> &dyn Any { - self - } fn name(&self) -> &str { "TestScalarUDF" } @@ -1850,10 +2018,6 @@ async fn test_extension_metadata_preserve_in_sql_values() -> Result<()> { } impl ScalarUDFImpl for MakeExtension { - fn as_any(&self) -> &dyn Any { - self - } - fn name(&self) -> &str { "make_extension" } @@ -1931,10 +2095,6 @@ async fn test_extension_metadata_preserve_in_subquery() -> Result<()> { } impl ScalarUDFImpl for ExtensionScalarPredicate { - fn as_any(&self) -> &dyn Any { - self - } - fn name(&self) -> &str { "extension_predicate" } diff --git a/datafusion/core/tests/user_defined/user_defined_table_functions.rs b/datafusion/core/tests/user_defined/user_defined_table_functions.rs index 2c6611f382cea..c8ded3a6fce3f 100644 --- a/datafusion/core/tests/user_defined/user_defined_table_functions.rs +++ b/datafusion/core/tests/user_defined/user_defined_table_functions.rs @@ -21,20 +21,20 @@ use std::path::Path; use std::sync::Arc; use arrow::array::Int64Array; -use arrow::csv::reader::Format; use arrow::csv::ReaderBuilder; +use arrow::csv::reader::Format; use datafusion::arrow::datatypes::SchemaRef; use datafusion::arrow::record_batch::RecordBatch; use datafusion::common::test_util::batches_to_string; -use datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::TableProvider; +use datafusion::datasource::memory::MemorySourceConfig; use datafusion::error::Result; use datafusion::execution::TaskContext; -use datafusion::physical_plan::{collect, ExecutionPlan}; +use datafusion::physical_plan::{ExecutionPlan, collect}; use datafusion::prelude::SessionContext; -use datafusion_catalog::Session; use datafusion_catalog::TableFunctionImpl; +use datafusion_catalog::{Session, TableFunctionArgs}; use datafusion_common::{DFSchema, ScalarValue}; use datafusion_expr::{EmptyRelation, Expr, LogicalPlan, Projection, TableType}; @@ -55,7 +55,7 @@ async fn test_simple_read_csv_udtf() -> Result<()> { .collect() .await?; - insta::assert_snapshot!(batches_to_string(&rbs), @r###" + insta::assert_snapshot!(batches_to_string(&rbs), @r" +-------------+-----------+-------------+-------------------------------------------------------------------------------------------------------------+ | n_nationkey | n_name | n_regionkey | n_comment | +-------------+-----------+-------------+-------------------------------------------------------------------------------------------------------------+ @@ -65,7 +65,7 @@ async fn test_simple_read_csv_udtf() -> Result<()> { | 4 | EGYPT | 4 | y above the carefully unusual theodolites. final dugouts are quickly across the furiously regular d | | 5 | ETHIOPIA | 0 | ven packages wake quickly. 
regu | +-------------+-----------+-------------+-------------------------------------------------------------------------------------------------------------+ - "###); + "); // just run, return all rows let rbs = ctx @@ -74,7 +74,7 @@ async fn test_simple_read_csv_udtf() -> Result<()> { .collect() .await?; - insta::assert_snapshot!(batches_to_string(&rbs), @r###" + insta::assert_snapshot!(batches_to_string(&rbs), @r" +-------------+-----------+-------------+--------------------------------------------------------------------------------------------------------------------+ | n_nationkey | n_name | n_regionkey | n_comment | +-------------+-----------+-------------+--------------------------------------------------------------------------------------------------------------------+ @@ -89,7 +89,7 @@ async fn test_simple_read_csv_udtf() -> Result<()> { | 9 | INDONESIA | 2 | slyly express asymptotes. regular deposits haggle slyly. carefully ironic hockey players sleep blithely. carefull | | 10 | IRAN | 4 | efully alongside of the slyly final dependencies. | +-------------+-----------+-------------+--------------------------------------------------------------------------------------------------------------------+ - "###); + "); Ok(()) } @@ -118,10 +118,6 @@ struct SimpleCsvTable { #[async_trait] impl TableProvider for SimpleCsvTable { - fn as_any(&self) -> &dyn std::any::Any { - self - } - fn schema(&self) -> SchemaRef { self.schema.clone() } @@ -200,12 +196,13 @@ impl SimpleCsvTable { struct SimpleCsvTableFunc {} impl TableFunctionImpl for SimpleCsvTableFunc { - fn call(&self, exprs: &[Expr]) -> Result<Arc<dyn TableProvider>> { + fn call_with_args(&self, args: TableFunctionArgs) -> Result<Arc<dyn TableProvider>> { + let exprs = args.exprs(); let mut new_exprs = vec![]; let mut filepath = String::new(); for expr in exprs { match expr { - Expr::Literal(ScalarValue::Utf8(Some(ref path)), _) => { + Expr::Literal(ScalarValue::Utf8(Some(path)), _) => { filepath.clone_from(path); } expr => new_exprs.push(expr.clone()), @@ -221,6 +218,31 @@ impl TableFunctionImpl for SimpleCsvTableFunc { } } +/// Test that expressions passed to UDTFs are properly type-coerced +/// This is a regression test for https://github.com/apache/datafusion/issues/19914 +#[tokio::test] +async fn test_udtf_type_coercion() -> Result<()> { + use datafusion::datasource::MemTable; + + #[derive(Debug)] + struct NoOpTableFunc; + + impl TableFunctionImpl for NoOpTableFunc { + fn call_with_args(&self, _: TableFunctionArgs) -> Result<Arc<dyn TableProvider>> { + let schema = Arc::new(arrow::datatypes::Schema::empty()); + Ok(Arc::new(MemTable::try_new(schema, vec![vec![]])?)) + } + } + + let ctx = SessionContext::new(); + ctx.register_udtf("f", Arc::new(NoOpTableFunc)); + + // This should not panic - the array elements should be coerced to Float64 + let _ = ctx.sql("SELECT * FROM f(ARRAY[0.1, 1, 2])").await?; + + Ok(()) +} + fn read_csv_batches(csv_path: impl AsRef<Path>) -> Result<(SchemaRef, Vec<RecordBatch>)> { let mut file = File::open(csv_path)?; let (schema, _) = Format::default() diff --git a/datafusion/core/tests/user_defined/user_defined_window_functions.rs b/datafusion/core/tests/user_defined/user_defined_window_functions.rs index 33607ebc0d2cc..afaf269ca1200 100644 --- a/datafusion/core/tests/user_defined/user_defined_window_functions.rs +++ b/datafusion/core/tests/user_defined/user_defined_window_functions.rs @@ -19,8 +19,8 @@ //!
user defined window functions use arrow::array::{ - record_batch, Array, ArrayRef, AsArray, Int64Array, RecordBatch, StringArray, - UInt64Array, + Array, ArrayRef, AsArray, Int64Array, RecordBatch, StringArray, UInt64Array, + record_batch, }; use arrow::datatypes::{DataType, Field, Schema}; use arrow_schema::FieldRef; @@ -38,17 +38,16 @@ use datafusion_functions_window_common::{ expr::ExpressionArgs, field::WindowUDFFieldArgs, }; use datafusion_physical_expr::{ - expressions::{col, lit}, PhysicalExpr, + expressions::{col, lit}, }; use std::collections::HashMap; use std::hash::{Hash, Hasher}; use std::{ - any::Any, ops::Range, sync::{ - atomic::{AtomicUsize, Ordering}, Arc, + atomic::{AtomicUsize, Ordering}, }, }; @@ -62,8 +61,7 @@ const UNBOUNDED_WINDOW_QUERY_WITH_ALIAS: &str = "SELECT x, y, val, \ from t ORDER BY x, y"; /// A query with a window function evaluated over a moving window -const BOUNDED_WINDOW_QUERY: &str = - "SELECT x, y, val, \ +const BOUNDED_WINDOW_QUERY: &str = "SELECT x, y, val, \ odd_counter(val) OVER (PARTITION BY x ORDER BY y ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) \ from t ORDER BY x, y"; @@ -75,22 +73,22 @@ async fn test_setup() { let sql = "SELECT * from t order by x, y"; let actual = execute(&ctx, sql).await.unwrap(); - insta::assert_snapshot!(batches_to_string(&actual), @r###" - +---+---+-----+ - | x | y | val | - +---+---+-----+ - | 1 | a | 0 | - | 1 | b | 1 | - | 1 | c | 2 | - | 2 | d | 3 | - | 2 | e | 4 | - | 2 | f | 5 | - | 2 | g | 6 | - | 2 | h | 6 | - | 2 | i | 6 | - | 2 | j | 6 | - +---+---+-----+ - "###); + insta::assert_snapshot!(batches_to_string(&actual), @r" + +---+---+-----+ + | x | y | val | + +---+---+-----+ + | 1 | a | 0 | + | 1 | b | 1 | + | 1 | c | 2 | + | 2 | d | 3 | + | 2 | e | 4 | + | 2 | f | 5 | + | 2 | g | 6 | + | 2 | h | 6 | + | 2 | i | 6 | + | 2 | j | 6 | + +---+---+-----+ + "); } /// Basic user defined window function @@ -101,22 +99,22 @@ async fn test_udwf() { let actual = execute(&ctx, UNBOUNDED_WINDOW_QUERY).await.unwrap(); - insta::assert_snapshot!(batches_to_string(&actual), @r###" - +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ - | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW | - +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ - | 1 | a | 0 | 1 | - | 1 | b | 1 | 1 | - | 1 | c | 2 | 1 | - | 2 | d | 3 | 2 | - | 2 | e | 4 | 2 | - | 2 | f | 5 | 2 | - | 2 | g | 6 | 2 | - | 2 | h | 6 | 2 | - | 2 | i | 6 | 2 | - | 2 | j | 6 | 2 | - +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ - "###); + insta::assert_snapshot!(batches_to_string(&actual), @r" + +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ + | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW | + +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ + | 1 | a | 0 | 1 | + | 1 | b | 1 | 1 | + | 1 | c | 2 | 1 | + | 2 | d | 3 | 2 | + | 2 | e | 4 | 2 | + | 2 | f | 5 | 2 | + | 2 | g | 6 | 2 | + | 2 | h | 6 | 2 | + | 2 | i | 6 | 2 | + | 2 | j | 6 | 2 | + 
+---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ + "); // evaluated on two distinct batches assert_eq!(test_state.evaluate_all_called(), 2); @@ -175,22 +173,22 @@ async fn test_udwf_bounded_window_ignores_frame() { // Since the UDWF doesn't say it needs the window frame, the frame is ignored let actual = execute(&ctx, BOUNDED_WINDOW_QUERY).await.unwrap(); - insta::assert_snapshot!(batches_to_string(&actual), @r###" - +---+---+-----+--------------------------------------------------------------------------------------------------------------+ - | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING | - +---+---+-----+--------------------------------------------------------------------------------------------------------------+ - | 1 | a | 0 | 1 | - | 1 | b | 1 | 1 | - | 1 | c | 2 | 1 | - | 2 | d | 3 | 2 | - | 2 | e | 4 | 2 | - | 2 | f | 5 | 2 | - | 2 | g | 6 | 2 | - | 2 | h | 6 | 2 | - | 2 | i | 6 | 2 | - | 2 | j | 6 | 2 | - +---+---+-----+--------------------------------------------------------------------------------------------------------------+ - "###); + insta::assert_snapshot!(batches_to_string(&actual), @r" + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING | + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + | 1 | a | 0 | 1 | + | 1 | b | 1 | 1 | + | 1 | c | 2 | 1 | + | 2 | d | 3 | 2 | + | 2 | e | 4 | 2 | + | 2 | f | 5 | 2 | + | 2 | g | 6 | 2 | + | 2 | h | 6 | 2 | + | 2 | i | 6 | 2 | + | 2 | j | 6 | 2 | + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + "); // evaluated on 2 distinct batches (when x=1 and x=2) assert_eq!(test_state.evaluate_called(), 0); @@ -205,22 +203,22 @@ async fn test_udwf_bounded_window() { let actual = execute(&ctx, BOUNDED_WINDOW_QUERY).await.unwrap(); - insta::assert_snapshot!(batches_to_string(&actual), @r###" - +---+---+-----+--------------------------------------------------------------------------------------------------------------+ - | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING | - +---+---+-----+--------------------------------------------------------------------------------------------------------------+ - | 1 | a | 0 | 1 | - | 1 | b | 1 | 1 | - | 1 | c | 2 | 1 | - | 2 | d | 3 | 1 | - | 2 | e | 4 | 2 | - | 2 | f | 5 | 1 | - | 2 | g | 6 | 1 | - | 2 | h | 6 | 0 | - | 2 | i | 6 | 0 | - | 2 | j | 6 | 0 | - +---+---+-----+--------------------------------------------------------------------------------------------------------------+ - "###); + insta::assert_snapshot!(batches_to_string(&actual), @r" + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING | + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + | 1 | a | 0 | 1 | + | 1 | b | 1 | 1 | + | 1 | c | 2 | 1 | + | 2 | d | 3 | 1 | + | 2 | 
e | 4 | 2 | + | 2 | f | 5 | 1 | + | 2 | g | 6 | 1 | + | 2 | h | 6 | 0 | + | 2 | i | 6 | 0 | + | 2 | j | 6 | 0 | + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + "); // Evaluate is called for each input rows assert_eq!(test_state.evaluate_called(), 10); @@ -237,22 +235,22 @@ async fn test_stateful_udwf() { let actual = execute(&ctx, UNBOUNDED_WINDOW_QUERY).await.unwrap(); - insta::assert_snapshot!(batches_to_string(&actual), @r###" - +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ - | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW | - +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ - | 1 | a | 0 | 0 | - | 1 | b | 1 | 1 | - | 1 | c | 2 | 1 | - | 2 | d | 3 | 1 | - | 2 | e | 4 | 1 | - | 2 | f | 5 | 2 | - | 2 | g | 6 | 2 | - | 2 | h | 6 | 2 | - | 2 | i | 6 | 2 | - | 2 | j | 6 | 2 | - +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ - "###); + insta::assert_snapshot!(batches_to_string(&actual), @r" + +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ + | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW | + +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ + | 1 | a | 0 | 0 | + | 1 | b | 1 | 1 | + | 1 | c | 2 | 1 | + | 2 | d | 3 | 1 | + | 2 | e | 4 | 1 | + | 2 | f | 5 | 2 | + | 2 | g | 6 | 2 | + | 2 | h | 6 | 2 | + | 2 | i | 6 | 2 | + | 2 | j | 6 | 2 | + +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ + "); assert_eq!(test_state.evaluate_called(), 10); assert_eq!(test_state.evaluate_all_called(), 0); @@ -268,22 +266,22 @@ async fn test_stateful_udwf_bounded_window() { let actual = execute(&ctx, BOUNDED_WINDOW_QUERY).await.unwrap(); - insta::assert_snapshot!(batches_to_string(&actual), @r###" - +---+---+-----+--------------------------------------------------------------------------------------------------------------+ - | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING | - +---+---+-----+--------------------------------------------------------------------------------------------------------------+ - | 1 | a | 0 | 1 | - | 1 | b | 1 | 1 | - | 1 | c | 2 | 1 | - | 2 | d | 3 | 1 | - | 2 | e | 4 | 2 | - | 2 | f | 5 | 1 | - | 2 | g | 6 | 1 | - | 2 | h | 6 | 0 | - | 2 | i | 6 | 0 | - | 2 | j | 6 | 0 | - +---+---+-----+--------------------------------------------------------------------------------------------------------------+ - "###); + insta::assert_snapshot!(batches_to_string(&actual), @r" + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING | + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ 
+ | 1 | a | 0 | 1 | + | 1 | b | 1 | 1 | + | 1 | c | 2 | 1 | + | 2 | d | 3 | 1 | + | 2 | e | 4 | 2 | + | 2 | f | 5 | 1 | + | 2 | g | 6 | 1 | + | 2 | h | 6 | 0 | + | 2 | i | 6 | 0 | + | 2 | j | 6 | 0 | + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + "); // Evaluate and update_state is called for each input row assert_eq!(test_state.evaluate_called(), 10); @@ -298,22 +296,22 @@ async fn test_udwf_query_include_rank() { let actual = execute(&ctx, UNBOUNDED_WINDOW_QUERY).await.unwrap(); - insta::assert_snapshot!(batches_to_string(&actual), @r###" - +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ - | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW | - +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ - | 1 | a | 0 | 3 | - | 1 | b | 1 | 2 | - | 1 | c | 2 | 1 | - | 2 | d | 3 | 7 | - | 2 | e | 4 | 6 | - | 2 | f | 5 | 5 | - | 2 | g | 6 | 4 | - | 2 | h | 6 | 3 | - | 2 | i | 6 | 2 | - | 2 | j | 6 | 1 | - +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ - "###); + insta::assert_snapshot!(batches_to_string(&actual), @r" + +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ + | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW | + +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ + | 1 | a | 0 | 3 | + | 1 | b | 1 | 2 | + | 1 | c | 2 | 1 | + | 2 | d | 3 | 7 | + | 2 | e | 4 | 6 | + | 2 | f | 5 | 5 | + | 2 | g | 6 | 4 | + | 2 | h | 6 | 3 | + | 2 | i | 6 | 2 | + | 2 | j | 6 | 1 | + +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ + "); assert_eq!(test_state.evaluate_called(), 0); assert_eq!(test_state.evaluate_all_called(), 0); @@ -329,22 +327,22 @@ async fn test_udwf_bounded_query_include_rank() { let actual = execute(&ctx, BOUNDED_WINDOW_QUERY).await.unwrap(); - insta::assert_snapshot!(batches_to_string(&actual), @r###" - +---+---+-----+--------------------------------------------------------------------------------------------------------------+ - | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING | - +---+---+-----+--------------------------------------------------------------------------------------------------------------+ - | 1 | a | 0 | 3 | - | 1 | b | 1 | 2 | - | 1 | c | 2 | 1 | - | 2 | d | 3 | 7 | - | 2 | e | 4 | 6 | - | 2 | f | 5 | 5 | - | 2 | g | 6 | 4 | - | 2 | h | 6 | 3 | - | 2 | i | 6 | 2 | - | 2 | j | 6 | 1 | - +---+---+-----+--------------------------------------------------------------------------------------------------------------+ - "###); + insta::assert_snapshot!(batches_to_string(&actual), @r" + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING | + 
+---+---+-----+--------------------------------------------------------------------------------------------------------------+ + | 1 | a | 0 | 3                                                                                                            | + | 1 | b | 1 | 2                                                                                                            | + | 1 | c | 2 | 1                                                                                                            | + | 2 | d | 3 | 7                                                                                                            | + | 2 | e | 4 | 6                                                                                                            | + | 2 | f | 5 | 5                                                                                                            | + | 2 | g | 6 | 4                                                                                                            | + | 2 | h | 6 | 3                                                                                                            | + | 2 | i | 6 | 2                                                                                                            | + | 2 | j | 6 | 1                                                                                                            | + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + "); assert_eq!(test_state.evaluate_called(), 0); assert_eq!(test_state.evaluate_all_called(), 0); @@ -362,22 +360,22 @@ async fn test_udwf_bounded_window_returns_null() { let actual = execute(&ctx, BOUNDED_WINDOW_QUERY).await.unwrap(); - insta::assert_snapshot!(batches_to_string(&actual), @r###" - +---+---+-----+--------------------------------------------------------------------------------------------------------------+ - | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING | - +---+---+-----+--------------------------------------------------------------------------------------------------------------+ - | 1 | a | 0 | 1                                                                                                            | - | 1 | b | 1 | 1                                                                                                            | - | 1 | c | 2 | 1                                                                                                            | - | 2 | d | 3 | 1                                                                                                            | - | 2 | e | 4 | 2                                                                                                            | - | 2 | f | 5 | 1                                                                                                            | - | 2 | g | 6 | 1                                                                                                            | - | 2 | h | 6 |                                                                                                              | - | 2 | i | 6 |                                                                                                              | - | 2 | j | 6 |                                                                                                              | - +---+---+-----+--------------------------------------------------------------------------------------------------------------+ - "###); + insta::assert_snapshot!(batches_to_string(&actual), @r" + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING | + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + | 1 | a | 0 | 1                                                                                                            | + | 1 | b | 1 | 1                                                                                                            | + | 1 | c | 2 | 1                                                                                                            | + | 2 | d | 3 | 1                                                                                                            | + | 2 | e | 4 | 2                                                                                                            | + | 2 | f | 5 | 1                                                                                                            | + | 2 | g | 6 | 1                                                                                                            | + | 2 | h | 6 |                                                                                                              | + | 2 | i | 6 |                                                                                                              | + | 2 | j | 6 |                                                                                                              | + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + "); // Evaluate is called for each input rows assert_eq!(test_state.evaluate_called(), 10); @@ -537,7 +535,7 @@ impl OddCounter { impl SimpleWindowUDF { fn new(test_state: Arc<TestState>) -> Self { let signature = - Signature::exact(vec![DataType::Float64], Volatility::Immutable); + Signature::exact(vec![DataType::Int64], Volatility::Immutable); Self { signature, test_state: test_state.into(), @@ -547,10 +545,6 @@ impl OddCounter { } impl WindowUDFImpl for SimpleWindowUDF { - fn as_any(&self) -> &dyn Any { - self - } - fn name(&self) -> &str { "odd_counter" } @@ -616,7 +610,9 @@ impl PartitionEvaluator for OddCounter { ranks_in_partition: &[Range<usize>], ) -> Result<ArrayRef> { self.test_state.inc_evaluate_all_with_rank_called(); - println!("evaluate_all_with_rank, values: {num_rows:#?}, ranks_in_partitions: {ranks_in_partition:?}"); + println!( + "evaluate_all_with_rank, values: {num_rows:#?}, ranks_in_partitions: {ranks_in_partition:?}" + ); // when evaluating with ranks, just return the inverse rank instead let array: Int64Array = ranks_in_partition .iter() @@ -674,10 +670,6 @@ impl VariadicWindowUDF { } impl WindowUDFImpl for VariadicWindowUDF { - fn as_any(&self) -> &dyn Any { - self - } - fn name(&self) -> &str { "variadic_window_udf" } @@ -818,10 +810,6 @@ impl MetadataBasedWindowUdf { }
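A note on the `as_any` deletions running through this file: as in the scalar UDF changes earlier, implementations apparently no longer write `as_any` themselves, presumably because the trait now supplies a default. Under that assumption, a minimal window UDF reduces to roughly the sketch below; the struct, names, and method set are illustrative only (they assume the current datafusion-expr API and are not part of this diff):

use std::sync::Arc;

use arrow::datatypes::{DataType, Field, FieldRef};
use datafusion_common::Result;
use datafusion_expr::function::PartitionEvaluatorArgs;
use datafusion_expr::{PartitionEvaluator, Signature, Volatility, WindowUDFImpl};
use datafusion_functions_window_common::field::WindowUDFFieldArgs;

/// Illustrative-only window UDF: note there is no `as_any` implementation.
#[derive(Debug)]
struct MinimalUdwf {
    signature: Signature,
}

impl MinimalUdwf {
    fn new() -> Self {
        Self {
            // same shape as SimpleWindowUDF above: a single Int64 argument
            signature: Signature::exact(vec![DataType::Int64], Volatility::Immutable),
        }
    }
}

impl WindowUDFImpl for MinimalUdwf {
    fn name(&self) -> &str {
        "minimal_udwf"
    }

    fn signature(&self) -> &Signature {
        &self.signature
    }

    fn partition_evaluator(
        &self,
        _args: PartitionEvaluatorArgs,
    ) -> Result<Box<dyn PartitionEvaluator>> {
        // a real implementation returns its PartitionEvaluator here,
        // as OddCounter does in the tests above
        todo!()
    }

    fn field(&self, field_args: WindowUDFFieldArgs) -> Result<FieldRef> {
        // output column: a nullable Int64 named after the call site
        Ok(Arc::new(Field::new(field_args.name(), DataType::Int64, true)))
    }
}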
impl WindowUDFImpl for MetadataBasedWindowUdf { - fn as_any(&self) -> &dyn Any { - self - } - fn name(&self) -> &str { &self.name } diff --git a/datafusion/datasource-arrow/Cargo.toml b/datafusion/datasource-arrow/Cargo.toml index b3d1e3f2accc9..2718e424c6386 100644 --- a/datafusion/datasource-arrow/Cargo.toml +++ b/datafusion/datasource-arrow/Cargo.toml @@ -51,6 +51,9 @@ tokio = { workspace = true } [dev-dependencies] chrono = { workspace = true } +# Note: add additional linter rules in lib.rs (a sketch is shown below). +# Cargo does not yet support combining workspace lints with extra per-crate rules in subcrates +# https://github.com/rust-lang/cargo/issues/13157 [lints] workspace = true @@ -59,6 +62,6 @@ name = "datafusion_datasource_arrow" path = "src/mod.rs" [features] -compression = [ - "arrow-ipc/zstd", -] +# This feature is deprecated: core functionality in the SpillManager now requires everything +# this feature used to enable, so it will be removed in a future version. +compression = [] diff --git a/datafusion/datasource-arrow/NOTICE.txt b/datafusion/datasource-arrow/NOTICE.txt index 7f3c80d606c07..0bd2d52368fea 100644 --- a/datafusion/datasource-arrow/NOTICE.txt +++ b/datafusion/datasource-arrow/NOTICE.txt @@ -1,5 +1,5 @@ Apache DataFusion -Copyright 2019-2025 The Apache Software Foundation +Copyright 2019-2026 The Apache Software Foundation This product includes software developed at The Apache Software Foundation (http://www.apache.org/). diff --git a/datafusion/datasource-arrow/src/file_format.rs b/datafusion/datasource-arrow/src/file_format.rs index 3b85640804219..9297486ad66e7 100644 --- a/datafusion/datasource-arrow/src/file_format.rs +++ b/datafusion/datasource-arrow/src/file_format.rs @@ -19,31 +19,31 @@ //! //! Works with files following the [Arrow IPC format](https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format) -use std::any::Any; -use std::borrow::Cow; use std::collections::HashMap; use std::fmt::{self, Debug}; +use std::io::{Seek, SeekFrom}; use std::sync::Arc; use arrow::datatypes::{Schema, SchemaRef}; use arrow::error::ArrowError; use arrow::ipc::convert::fb_to_schema; -use arrow::ipc::reader::FileReader; +use arrow::ipc::reader::{FileReader, StreamReader}; use arrow::ipc::writer::IpcWriteOptions; -use arrow::ipc::{root_as_message, CompressionType}; +use arrow::ipc::{CompressionType, root_as_message}; use datafusion_common::error::Result; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::{ - internal_datafusion_err, not_impl_err, DataFusionError, GetExt, Statistics, - DEFAULT_ARROW_EXTENSION, + DEFAULT_ARROW_EXTENSION, DataFusionError, GetExt, Statistics, + internal_datafusion_err, not_impl_err, }; use datafusion_common_runtime::{JoinSet, SpawnedTask}; +use datafusion_datasource::TableSchema; use datafusion_datasource::display::FileGroupDisplay; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; use datafusion_datasource::sink::{DataSink, DataSinkExec}; use datafusion_datasource::write::{ - get_writer_schema, ObjectWriterBuilder, SharedBuffer, + ObjectWriterBuilder, SharedBuffer, get_writer_schema, }; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_expr::dml::InsertOp; @@ -59,9 +59,12 @@ use datafusion_datasource::source::DataSourceExec; use datafusion_datasource::write::demux::DemuxedStreamReceiver; use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; use datafusion_session::Session; -use futures::stream::BoxStream; use futures::StreamExt;
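The sketch promised by the Cargo.toml lint note above: with `[lints] workspace = true`, extra crate-level rules cannot be declared alongside the workspace lints in Cargo.toml (rust-lang/cargo#13157), so they go at the top of the crate root, here src/mod.rs. The specific rules below are illustrative only, not part of this diff:

// top of src/mod.rs -- workspace lints come from Cargo.toml's
// `[lints] workspace = true`; anything extra is a crate attribute:
#![deny(clippy::clone_on_ref_ptr)] // illustrative extra rule
#![warn(missing_docs)]             // illustrative extra rule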
-use object_store::{GetResultPayload, ObjectMeta, ObjectStore}; +use futures::stream::BoxStream; +use object_store::{ + GetOptions, GetRange, GetResultPayload, ObjectMeta, ObjectStore, ObjectStoreExt, + path::Path, +}; use tokio::io::AsyncWriteExt; /// Initial writing buffer size. Note this is just a size hint for efficiency. It @@ -71,8 +74,8 @@ const INITIAL_BUFFER_BYTES: usize = 1048576; /// If the buffered Arrow data exceeds this size, it is flushed to object store const BUFFER_FLUSH_BYTES: usize = 1024000; +/// Factory struct used to create [`ArrowFormat`] #[derive(Default, Debug)] -/// Factory struct used to create [ArrowFormat] pub struct ArrowFormatFactory; impl ArrowFormatFactory { @@ -94,10 +97,6 @@ impl FileFormatFactory for ArrowFormatFactory { fn default(&self) -> Arc<dyn FileFormat> { Arc::new(ArrowFormat) } - - fn as_any(&self) -> &dyn Any { - self - } } impl GetExt for ArrowFormatFactory { @@ -107,16 +106,12 @@ impl GetExt for ArrowFormatFactory { } } -/// Arrow `FileFormat` implementation. +/// Arrow [`FileFormat`] implementation. #[derive(Default, Debug)] pub struct ArrowFormat; #[async_trait] impl FileFormat for ArrowFormat { - fn as_any(&self) -> &dyn Any { - self - } - fn get_ext(&self) -> String { ArrowFormatFactory::new().get_ext() } @@ -150,14 +145,27 @@ impl FileFormat for ArrowFormat { let schema = match r.payload { #[cfg(not(target_arch = "wasm32"))] GetResultPayload::File(mut file, _) => { - let reader = FileReader::try_new(&mut file, None)?; - reader.schema() - } - GetResultPayload::Stream(stream) => { - infer_schema_from_file_stream(stream).await? + match FileReader::try_new(&mut file, None) { + Ok(reader) => reader.schema(), + Err(file_error) => { + // not in the file format, but FileReader read some bytes + // while trying to parse the file and so we need to rewind + // it to the beginning of the file + file.seek(SeekFrom::Start(0))?; + match StreamReader::try_new(&mut file, None) { + Ok(reader) => reader.schema(), + Err(stream_error) => { + return Err(internal_datafusion_err!( + "Failed to parse Arrow file as either file format or stream format. File format error: {file_error}. Stream format error: {stream_error}" + )); + } + } + } + } } + GetResultPayload::Stream(stream) => infer_stream_schema(stream).await?, }; - schemas.push(schema.as_ref().clone()); + schemas.push(Arc::unwrap_or_clone(schema)); } let merged_schema = Schema::try_merge(schemas)?; Ok(Arc::new(merged_schema)) @@ -175,10 +183,40 @@ impl FileFormat for ArrowFormat { async fn create_physical_plan( &self, - _state: &dyn Session, + state: &dyn Session, conf: FileScanConfig, ) -> Result<Arc<dyn ExecutionPlan>> { - let source = Arc::new(ArrowSource::default()); + let object_store = state.runtime_env().object_store(&conf.object_store_url)?; + let object_location = &conf + .file_groups + .first() + .ok_or_else(|| internal_datafusion_err!("No files found in file group"))? + .files() + .first() + .ok_or_else(|| internal_datafusion_err!("No files found in file group"))? + .object_meta + .location;
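// What the format probe just below relies on (an illustrative sketch under
// assumed object_store APIs, not necessarily the helper's actual body): an
// IPC *file* begins with the 6-byte "ARROW1" magic, while an IPC *stream*
// begins directly with an encapsulated message, so fetching the first few
// bytes of the object is enough to tell the two layouts apart:
//
//     let head = object_store
//         .get_opts(
//             object_location,
//             GetOptions {
//                 range: Some(GetRange::Bounded(0..6)),
//                 ..Default::default()
//             },
//         )
//         .await?
//         .bytes()
//         .await?;
//     let is_file_format = head.as_ref() == b"ARROW1";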
+ + let table_schema = TableSchema::new( + Arc::clone(conf.file_schema()), + conf.table_partition_cols().clone(), + ); + + let mut source: Arc<dyn FileSource> = + match is_object_in_arrow_ipc_file_format(object_store, object_location).await + { + Ok(true) => Arc::new(ArrowSource::new_file_source(table_schema)), + Ok(false) => Arc::new(ArrowSource::new_stream_file_source(table_schema)), + Err(e) => Err(e)?, + }; + + // Preserve projection from the original file source + if let Some(projection) = conf.file_source.projection() + && let Some(new_source) = source.try_pushdown_projection(projection)? + { + source = new_source; + } + let config = FileScanConfigBuilder::from(conf) .with_source(source) .build(); @@ -202,12 +240,12 @@ impl FileFormat for ArrowFormat { Ok(Arc::new(DataSinkExec::new(input, sink, order_requirements)) as _) } - fn file_source(&self) -> Arc<dyn FileSource> { - Arc::new(ArrowSource::default()) + fn file_source(&self, table_schema: TableSchema) -> Arc<dyn FileSource> { + Arc::new(ArrowSource::new_file_source(table_schema)) } } -/// Implements [`FileSink`] for writing to arrow_ipc files +/// Implements [`FileSink`] for Arrow IPC files struct ArrowFileSink { config: FileSinkConfig, } @@ -327,10 +365,6 @@ impl DisplayAs for ArrowFileSink { #[async_trait] impl DataSink for ArrowFileSink { - fn as_any(&self) -> &dyn Any { - self - } - fn schema(&self) -> &SchemaRef { self.config.output_schema() } @@ -344,107 +378,178 @@ impl DataSink for ArrowFileSink { } } +// Custom implementation of inferring schema. Should eventually be moved upstream to arrow-rs. +// See + const ARROW_MAGIC: [u8; 6] = [b'A', b'R', b'R', b'O', b'W', b'1']; const CONTINUATION_MARKER: [u8; 4] = [0xff; 4]; -/// Custom implementation of inferring schema. Should eventually be moved upstream to arrow-rs. -/// See -async fn infer_schema_from_file_stream( +async fn infer_stream_schema( mut stream: BoxStream<'static, object_store::Result<Bytes>>, ) -> Result<SchemaRef> { - // Expected format: - // <magic number "ARROW1"> - 6 bytes - // <empty padding bytes [to 8 byte boundary]> - 2 bytes - // <continuation: 0xFFFFFFFF> - 4 bytes, not present below v0.15.0 - // <metadata_size: int32> - 4 bytes - // <metadata_flatbuffer: bytes> - // <rest of file bytes> - // - // So in first read we need at least all known sized sections, - // which is 6 + 2 + 4 + 4 = 16 bytes. - let bytes = collect_at_least_n_bytes(&mut stream, 16, None).await?; - - // Files should start with these magic bytes - if bytes[0..6] != ARROW_MAGIC { - return Err(ArrowError::ParseError( - "Arrow file does not contain correct header".to_string(), - ))?; - } - - // Since continuation marker bytes added in later versions - let (meta_len, rest_of_bytes_start_index) = if bytes[8..12] == CONTINUATION_MARKER { - (&bytes[12..16], 16) + // IPC streaming format. + // See https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format + // + // <SCHEMA> + // <DICTIONARY 0> + // ... + // <DICTIONARY k - 1> + // <RECORD BATCH 0> + // ... + // <DICTIONARY x DELTA> + // ... + // <DICTIONARY y DELTA> + // ... + // <RECORD BATCH n - 1> + // <EOS [optional]: 0x00000000> + + // The streaming format is made up of a sequence of encapsulated messages. + // See https://arrow.apache.org/docs/format/Columnar.html#encapsulated-message-format + // + // <continuation: 0xFFFFFFFF> (added in v0.15.0) + // <metadata_size: int32> + // <metadata_flatbuffer: bytes> + // <padding> + // <message body> + // + // The first message is the schema. + + // IPC file format is a wrapper around the streaming format with indexing information. + // See https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format + // + // <magic number "ARROW1"> + // <empty padding bytes [to 8 byte boundary]> + // <STREAMING FORMAT with EOS> + // <FOOTER>